diff --git a/.gitignore b/.gitignore index 2d1e5d842da4..749197668afc 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,10 @@ scala-package/*/*/target/ *.project *.settings !scala-package/*/bin +*.bak +*/node_modules/ + +# Eclipse project config +.project +.cproject +.pydevproject diff --git a/CMakeLists.txt b/CMakeLists.txt index 7613fe00375b..b81b1910c015 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,14 +10,17 @@ mxnet_option(USE_OPENMP "Build with Openmp support" ON) mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path mxnet_option(USE_CUDA "Build with CUDA support" ON) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) +mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") - include(mshadow/cmake/mshadow.cmake) include(mshadow/cmake/Utils.cmake) include(mshadow/cmake/Cuda.cmake) +set(mxnet_LINKER_LIBS "") +list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS}) + include_directories("include") include_directories("mshadow") include_directories("dmlc-core/include") @@ -29,7 +32,7 @@ if(MSVC) add_definitions(-D_CRT_SECURE_NO_WARNINGS) add_definitions(-DMXNET_EXPORTS) set(CMAKE_C_FLAGS "/MP") - set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} /bigobj") else(MSVC) include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) @@ -44,7 +47,7 @@ if(USE_OPENCV) find_package(OpenCV REQUIRED COMPONENTS core highgui imgproc) endif() include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS}) - list(APPEND mshadow_LINKER_LIBS ${OpenCV_LIBS}) + list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS}) message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})") add_definitions(-DMXNET_USE_OPENCV=1) else(USE_OPENCV) @@ -68,7 +71,7 @@ if(USE_CUDNN) if(HAVE_CUDNN) add_definitions(-DUSE_CUDNN) include_directories(SYSTEM ${CUDNN_INCLUDE}) - list(APPEND mshadow_LINKER_LIBS ${CUDNN_LIBRARY}) + list(APPEND mxnet_LINKER_LIBS ${CUDNN_LIBRARY}) add_definitions(-DMSHADOW_USE_CUDNN=1) endif() endif() @@ -115,12 +118,22 @@ mxnet_source_group("Source\\resource.cc" GLOB "src/resource.cc/*.cc") mxnet_source_group("Source\\storage" GLOB "src/storage/*.cc") mxnet_source_group("Source\\symbol" GLOB "src/symbol/*.cc") - - - - FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h") -FILE(GLOB_RECURSE cuda "src/*.cu") +FILE(GLOB_RECURSE CUDA "src/*.cu") + +if(USE_PLUGINS_WARPCTC) + set(WARPCTC_INCLUDE "" CACHE PATH "WARPCTC include") + set(WARPCTC_LIB "" CACHE FILEPATH "WARPCTC lib") + include_directories(SYSTEM ${WARPCTC_INCLUDE}) + list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB}) + mxnet_source_group("Include\\plugin\\warpctc" GLOB "plugin/warpctc/*.h") + mxnet_source_group("Source\\plugin\\warpctc" GLOB "plugin/warpctc/*.cc") + mxnet_source_group("Cuda\\plugin\\warpctc" GLOB "plugin/warpctc/*.cu") + FILE(GLOB_RECURSE PLUGINS_SOURCE "plugin/warpctc/*.cc" "plugin/warpctc/*.h") + FILE(GLOB_RECURSE PLUGINS_CUSRC "plugin/warpctc/*.cu") + list(APPEND SOURCE ${PLUGINS_SOURCE}) + list(APPEND CUDA ${PLUGINS_CUSRC}) +endif() if (NOT (EXTRA_OPERATORS STREQUAL "")) mxnet_source_group("Extra" GLOB_RECURSE "${EXTRA_OPERATORS}/*.cc") @@ -142,19 +155,18 @@ endif() if(USE_CUDA) # define preprocessor macro so that we will not include the generated forcelink header - mshadow_cuda_compile(cuda_objs ${cuda}) + mshadow_cuda_compile(cuda_objs ${CUDA}) if(MSVC) FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND 
mshadow_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
+    list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
     set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
-    list(APPEND mshadow_LINKER_LIBS ${CUDA_cuda_LIBRARY})
+    list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
   else(MSVC)
-    list(APPEND mshadow_LINKER_LIBS nvrtc cuda)
+    list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
   endif()
-  list(APPEND SOURCE ${cuda_objs} ${cuda})
+  list(APPEND SOURCE ${cuda_objs} ${CUDA})
 endif()
-
 if(NOT MSVC)
   # Only add c++11 flags and definitions after cuda compiling
   add_definitions(-DDMLC_USE_CXX11)
@@ -170,10 +182,9 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
 else()
   add_library(mxnet SHARED ${SOURCE})
 endif()
-target_link_libraries(mxnet ${mshadow_LINKER_LIBS})
+target_link_libraries(mxnet ${mxnet_LINKER_LIBS})
 target_link_libraries(mxnet dmlccore)
-
 if(MSVC)
   set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet")
 endif()
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a2578ea469a0..ee6fbcf057d3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -107,3 +107,6 @@ List of Contributors
 * [Yuqi Li](https://github.com/ziyeqinghan)
 * [Depeng Liang](https://github.com/Ldpe2G)
 * [Kiko Qiu](https://github.com/kikoqiu)
+* [Yang Bo](https://github.com/Atry)
+* [Jonas Amaro](https://github.com/jonasrla)
+* [Yan Li](https://github.com/Godricly)
diff --git a/Makefile b/Makefile
index 1ef81159ad07..ccea1ba0a7b0 100644
--- a/Makefile
+++ b/Makefile
@@ -22,7 +22,6 @@ endif
 include $(config)
 include mshadow/make/mshadow.mk
 include $(DMLC_CORE)/make/dmlc.mk
-unexport NO_OPENMP

 # all the possible warnings to treat
 WARNFLAGS= -Wall
@@ -37,9 +36,9 @@ endif
 CFLAGS += -I$(ROOTDIR)/mshadow/ -I$(ROOTDIR)/dmlc-core/include -fPIC -Iinclude $(MSHADOW_CFLAGS)
 LDFLAGS = -pthread $(MSHADOW_LDFLAGS) $(DMLC_LDFLAGS)
 ifeq ($(DEBUG), 1)
-	NVCCFLAGS = -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -G -O0 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 else
-	NVCCFLAGS = -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
+	NVCCFLAGS = -std=c++11 -Xcompiler -D_FORCE_INLINES -g -O3 -ccbin $(CXX) $(MSHADOW_NVCCFLAGS)
 endif

 ifndef LINT_LANG
@@ -150,8 +149,8 @@ endif

 build/src/%.o: src/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
+	$(CXX) -std=c++11 $(CFLAGS) -MM -MT build/src/$*.o $< >build/src/$*.d
+	$(CXX) -std=c++11 -c $(CFLAGS) -c $< -o $@

 build/src/%_gpu.o: src/%.cu
 	@mkdir -p $(@D)
@@ -160,20 +159,20 @@ build/src/%_gpu.o: src/%.cu

 build/plugin/%.o: plugin/%.cc
 	@mkdir -p $(@D)
-	$(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
-	$(CXX) -std=c++0x -c $(CFLAGS) -c $< -o $@
+	$(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*.o $< >build/plugin/$*.d
+	$(CXX) -std=c++11 -c $(CFLAGS) -c $< -o $@

 # An nvcc bug causes it to generate "generic/xxx.h" dependencies from torch headers.
 # Use CXX to generate dependency instead.
build/plugin/%_gpu.o: plugin/%.cu @mkdir -p $(@D) - $(CXX) -std=c++0x $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d + $(CXX) -std=c++11 $(CFLAGS) -MM -MT build/plugin/$*_gpu.o $< >build/plugin/$*_gpu.d $(NVCC) -c -o $@ $(NVCCFLAGS) -Xcompiler "$(CFLAGS)" $< $(EXTRA_OPERATORS)/build/%.o: $(EXTRA_OPERATORS)/%.cc @mkdir -p $(@D) - $(CXX) -std=c++0x $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d - $(CXX) -std=c++0x -c $(CFLAGS) -Isrc/operator -c $< -o $@ + $(CXX) -std=c++11 $(CFLAGS) -Isrc/operator -MM -MT $(EXTRA_OPERATORS)/build/$*.o $< >$(EXTRA_OPERATORS)/build/$*.d + $(CXX) -std=c++11 -c $(CFLAGS) -Isrc/operator -c $< -o $@ $(EXTRA_OPERATORS)/build/%_gpu.o: $(EXTRA_OPERATORS)/%.cu @mkdir -p $(@D) @@ -201,7 +200,7 @@ bin/im2rec: tools/im2rec.cc $(ALL_DEP) $(BIN) : @mkdir -p $(@D) - $(CXX) $(CFLAGS) -std=c++0x -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) + $(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS) include tests/cpp/unittest.mk @@ -237,26 +236,26 @@ rpkg: roxygen scalapkg: (cd $(ROOTDIR)/scala-package; \ mvn clean package -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \ - -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ - -Dlddeps="$(LIB_DEP)") + -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ + -Dlddeps="$(LIB_DEP)") scalatest: (cd $(ROOTDIR)/scala-package; \ mvn verify -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \ - -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ - -Dlddeps="$(LIB_DEP)" $(SCALA_TEST_ARGS)) + -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ + -Dlddeps="$(LIB_DEP)" $(SCALA_TEST_ARGS)) scalainstall: (cd $(ROOTDIR)/scala-package; \ mvn install -P$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \ - -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ - -Dlddeps="$(LIB_DEP)") + -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ + -Dlddeps="$(LIB_DEP)") scaladeploy: (cd $(ROOTDIR)/scala-package; \ mvn deploy -Prelease,$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \ - -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ - -Dlddeps="$(LIB_DEP)") + -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ + -Dlddeps="$(LIB_DEP)") jnilint: python2 dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 355233385cbf..ad95fe050ef3 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -26,6 +26,9 @@ export(mx.exec.update.arg.arrays) export(mx.exec.update.aux.arrays) export(mx.exec.update.grad.arrays) export(mx.gpu) +export(mx.gru) +export(mx.gru.forward) +export(mx.gru.inference) export(mx.init.Xavier) export(mx.init.create) export(mx.init.normal) @@ -80,6 +83,9 @@ export(mx.nd.transpose) export(mx.nd.zeros) export(mx.opt.create) export(mx.opt.get.updater) +export(mx.rnn) +export(mx.rnn.forward) +export(mx.rnn.inference) export(mx.rnorm) export(mx.runif) export(mx.set.seed) diff --git a/R-package/R/gru.R b/R-package/R/gru.R new file mode 100644 index 000000000000..d2ffd9a414c2 --- /dev/null +++ b/R-package/R/gru.R @@ -0,0 +1,355 @@ +# gru cell symbol +gru <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { + if (dropout > 0) + indata <- mx.symbol.Dropout(data=indata, p=dropout) + i2h <- mx.symbol.FullyConnected(data=indata, + weight=param$gates.i2h.weight, + bias=param$gates.i2h.bias, + num.hidden=num.hidden * 2, + name=paste0("t", seqidx, ".l", layeridx, ".gates.i2h")) + h2h <- mx.symbol.FullyConnected(data=prev.state$h, + weight=param$gates.h2h.weight, + bias=param$gates.h2h.bias, + num.hidden=num.hidden * 2, + 
name=paste0("t", seqidx, ".l", layeridx, ".gates.h2h")) + gates <- i2h + h2h + slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=2, + name=paste0("t", seqidx, ".l", layeridx, ".slice")) + update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid") + reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type="sigmoid") + + htrans.i2h <- mx.symbol.FullyConnected(data=indata, + weight=param$trans.i2h.weight, + bias=param$trans.i2h.bias, + num.hidden=num.hidden, + name=paste0("t", seqidx, ".l", layeridx, ".trans.i2h")) + h.after.reset <- prev.state$h * reset.gate + htrans.h2h <- mx.symbol.FullyConnected(data=h.after.reset, + weight=param$trans.h2h.weight, + bias=param$trans.h2h.bias, + num.hidden=num.hidden, + name=paste0("t", seqidx, ".l", layeridx, ".trans.h2h")) + h.trans <- htrans.i2h + htrans.h2h + h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh") + next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h) + return (list(h=next.h)) +} + +# unrolled gru network +gru.unroll <- function(num.gru.layer, seq.len, input.size, + num.hidden, num.embed, num.label, dropout=0) { + embed.weight <- mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + param.cells <- lapply(1:num.gru.layer, function(i) { + cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), + gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")), + gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), + gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), + trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), + trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), + trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), + trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.gru.layer, function(i) { + state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + label <- mx.symbol.Variable("label") + data <- mx.symbol.Variable("data") + embed <- mx.symbol.Embedding(data=data, input.dim=input.size, + weight=embed.weight, output.dim=num.embed, name='embed') + wordvec <- mx.symbol.SliceChannel(data=embed, num.outputs=seq.len, squeeze.axis=1) + + last.hidden <- list() + for (seqidx in 1:seq.len) { + hidden <- wordvec[[seqidx]] + # stack GRU + for (i in 1:num.gru.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- gru(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + last.hidden <- c(last.hidden, hidden) + } + last.hidden$dim <- 0 + last.hidden$num.args <- seq.len + concat <-mxnet:::mx.varg.symbol.Concat(last.hidden) + fc <- mx.symbol.FullyConnected(data=concat, + weight=cls.weight, + bias=cls.bias, + num.hidden=num.label) + + label <- mx.symbol.transpose(data=label) + label <- mx.symbol.Reshape(data=label, target.shape=c(0)) + + loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") + return (loss.all) +} + +# gru inference model symbol +gru.inference.symbol <- function(num.gru.layer, seq.len, input.size, + num.hidden, num.embed, num.label, dropout=0) { + seqidx <- 1 + embed.weight <- 
mx.symbol.Variable("embed.weight") + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.gru.layer, function(i) { + cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), + gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")), + gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), + gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), + trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), + trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), + trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), + trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias"))) + return (cell) + }) + last.states <- lapply(1:num.gru.layer, function(i) { + state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) + return (state) + }) + + # embeding layer + data <- mx.symbol.Variable("data") + hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + + # stack GRU + for (i in 1:num.gru.layer) { + dp <- ifelse(i==1, 0, dropout) + next.state <- gru(num.hidden, indata=hidden, + prev.state=last.states[[i]], + param=param.cells[[i]], + seqidx=seqidx, layeridx=i, + dropout=dp) + hidden <- next.state$h + last.states[[i]] <- next.state + } + # decoder + if (dropout > 0) + hidden <- mx.symbol.Dropout(data=hidden, p=dropout) + + fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, + weight=cls.weight, bias=cls.bias, name='pred') + sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') + unpack.h <- lapply(1:num.gru.layer, function(i) { + state <- last.states[[i]] + state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) + return (state.h) + }) + + list.all <- c(sm, unpack.h) + return (mx.symbol.Group(list.all)) +} + +#' Training GRU Unrolled Model +#' +#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) +#' The Training set. +#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional +#' The validation set used for validation evaluation during the progress. +#' @param num.gru.layer integer +#' The number of the layer of gru. +#' @param seq.len integer +#' The length of the input sequence. +#' @param num.hidden integer +#' The number of hidden nodes. +#' @param num.embed integer +#' The output dim of embedding. +#' @param num.label integer +#' The number of labels. +#' @param batch.size integer +#' The batch size used for R array training. +#' @param input.size integer +#' The input dim of one-hot encoding of embedding +#' @param ctx mx.context, optional +#' The device used to perform training. +#' @param num.round integer, default=10 +#' The number of iterations over training data to train the model. +#' @param update.period integer, default=1 +#' The number of iterations to update parameters during training period. +#' @param initializer initializer object. default=mx.init.uniform(0.01) +#' The initialization scheme for parameters. +#' @param dropout float, default=0 +#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. +#' @param optimizer string, default="sgd" +#' The optimization method. +#' @param ... other parameters passing to \code{mx.gru}/. +#' @return model A trained gru unrolled model. 
+
+#' Training GRU Unrolled Model
+#'
+#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
+#'     The Training set.
+#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
+#'     The validation set used for validation evaluation during the progress.
+#' @param num.gru.layer integer
+#'     The number of the layer of gru.
+#' @param seq.len integer
+#'     The length of the input sequence.
+#' @param num.hidden integer
+#'     The number of hidden nodes.
+#' @param num.embed integer
+#'     The output dim of embedding.
+#' @param num.label integer
+#'     The number of labels.
+#' @param batch.size integer
+#'     The batch size used for R array training.
+#' @param input.size integer
+#'     The input dim of one-hot encoding of embedding
+#' @param ctx mx.context, optional
+#'     The device used to perform training.
+#' @param num.round integer, default=10
+#'     The number of iterations over training data to train the model.
+#' @param update.period integer, default=1
+#'     The number of iterations to update parameters during training period.
+#' @param initializer initializer object. default=mx.init.uniform(0.01)
+#'     The initialization scheme for parameters.
+#' @param dropout float, default=0
+#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param optimizer string, default="sgd"
+#'     The optimization method.
+#' @param ... other parameters passed to \code{mx.gru}.
+#' @return model A trained gru unrolled model.
+#'
+#' @export
+mx.gru <- function( train.data, eval.data=NULL,
+                    num.gru.layer, seq.len,
+                    num.hidden, num.embed, num.label,
+                    batch.size, input.size,
+                    ctx=mx.ctx.default(),
+                    num.round=10, update.period=1,
+                    initializer=mx.init.uniform(0.01),
+                    dropout=0, optimizer='sgd',
+                    ...) {
+    # check data and change data into iterator
+    train.data <- check.data(train.data, batch.size, TRUE)
+    eval.data <- check.data(eval.data, batch.size, FALSE)
+
+    # get unrolled gru symbol
+    rnn.sym <- gru.unroll( num.gru.layer=num.gru.layer,
+                           num.hidden=num.hidden,
+                           seq.len=seq.len,
+                           input.size=input.size,
+                           num.embed=num.embed,
+                           num.label=num.label,
+                           dropout=dropout)
+
+    init.states.name <- lapply(1:num.gru.layer, function(i) {
+        state.h <- paste0("l", i, ".init.h")
+        return (state.h)
+    })
+
+    # set up gru model
+    model <- setup.rnn.model(rnn.sym=rnn.sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.gru.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=initializer,
+                             dropout=dropout)
+
+    # train gru model
+    model <- train.rnn( model, train.data, eval.data,
+                        num.round=num.round,
+                        update.period=update.period,
+                        ctx=ctx,
+                        init.states.name=init.states.name,
+                        ...)
+    # change model into MXFeedForwardModel
+    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
+    return(structure(model, class="MXFeedForwardModel"))
+}
+
+#' Create a GRU Inference Model
+#'
+#' @param num.gru.layer integer
+#'     The number of the layer of gru.
+#' @param input.size integer
+#'     The input dim of one-hot encoding of embedding
+#' @param num.hidden integer
+#'     The number of hidden nodes.
+#' @param num.embed integer
+#'     The output dim of embedding.
+#' @param num.label integer
+#'     The number of labels.
+#' @param batch.size integer, default=1
+#'     The batch size used for R array training.
+#' @param arg.params list
+#'     Model parameter, list of name to NDArray of net's weights.
+#' @param ctx mx.context, optional
+#'     The device used to perform training.
+#' @param dropout float, default=0
+#'     A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
+#'     A gru inference model.
+#'
+#' @export
+mx.gru.inference <- function(num.gru.layer,
+                             input.size,
+                             num.hidden,
+                             num.embed,
+                             num.label,
+                             batch.size=1,
+                             arg.params,
+                             ctx=mx.cpu(),
+                             dropout=0.)
{ + sym <- gru.inference.symbol(num.gru.layer=num.gru.layer, + input.size=input.size, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + dropout=dropout) + + init.states.name <- lapply(1:num.gru.layer, function(i) { + state.h <- paste0("l", i, ".init.h") + return (state.h) + }) + + seq.len <- 1 + # set up gru model + model <- setup.rnn.model(rnn.sym=sym, + ctx=ctx, + num.rnn.layer=num.gru.layer, + seq.len=seq.len, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + batch.size=batch.size, + input.size=input.size, + init.states.name=init.states.name, + initializer=mx.init.uniform(0.01), + dropout=dropout) + arg.names <- names(model$rnn.exec$ref.arg.arrays) + for (k in names(arg.params)) { + if ((k %in% arg.names) && is.param.name(k) ) { + rnn.input <- list() + rnn.input[[k]] <- arg.params[[k]] + mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) + } + } + init.states <- list() + for (i in 1:num.gru.layer) { + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + + return (model) +} + +#' Using forward function to predict in gru inference model +#' +#' @param model gru model +#' A gru inference model +#' @param input.data, array.matrix +#' The input data for forward function +#' @param new.seq boolean, default=FALSE +#' Whether the input is the start of a new sequence +#' +#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. +#' +#' @export +mx.gru.forward <- function(model, input.data, new.seq=FALSE) { + if (new.seq == TRUE) { + init.states <- list() + for (i in 1:model$num.rnn.layer) { + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + } + dim(input.data) <- c(model$batch.size) + data <- list(data=mx.nd.array(input.data)) + mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) + mx.exec.forward(model$rnn.exec, is.train=FALSE) + init.states <- list() + for (i in 1:model$num.rnn.layer) { + init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] + } + mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) + prob <- model$rnn.exec$ref.outputs[["sm_output"]] + return (list(prob=prob, model=model)) +} + diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R index 3fcd0e831751..27c1c2e96eac 100644 --- a/R-package/R/lstm.R +++ b/R-package/R/lstm.R @@ -56,9 +56,7 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, last.hidden <- list() for (seqidx in 1:seq.len) { - - hidden = wordvec[[seqidx]] - + hidden <- wordvec[[seqidx]] # stack lstm for (i in 1:num.lstm.layer) { dp <- ifelse(i==1, 0, dropout) @@ -90,6 +88,7 @@ lstm.unroll <- function(num.lstm.layer, seq.len, input.size, return (loss.all) } +# lstm inference model symbol lstm.inference.symbol <- function(num.lstm.layer, input.size, num.hidden, num.embed, num.label, dropout=0.) 
{ seqidx <- 0 @@ -99,9 +98,9 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size, param.cells <- lapply(1:num.lstm.layer, function(i) { cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) return (cell) }) last.states <- lapply(1:num.lstm.layer, function(i) { @@ -148,250 +147,7 @@ lstm.inference.symbol <- function(num.lstm.layer, input.size, return (mx.symbol.Group(list.all)) } -is.param.name <- function(name) { - return (grepl('weight$', name) || grepl('bias$', name) || - grepl('gamma$', name) || grepl('beta$', name) ) -} - -# Initialize parameters -mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) { - if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol") - slist <- symbol$infer.shape(input.shape) - if (is.null(slist)) stop("Not enough information to get shapes") - arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE) - aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE) - return(list(arg.params=arg.params, aux.params=aux.params)) -} - -# Initialize the data iter -mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) { - if (is.MXDataIter(X)) return(X) - shape <- dim(data) - if (is.null(shape)) { - num.data <- length(X) - } else { - ndim <- length(shape) - num.data <- shape[[ndim]] - } - if (is.null(y)) { - if (is.train) stop("Need to provide parameter y for training with R arrays.") - y <- c(1:num.data) * 0 - } - - batch.size <- min(num.data, batch.size) - - return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) -} - -# set up rnn model with lstm cells -setup.rnn.model <- function(rnn.sym, ctx, - num.lstm.layer, seq.len, - num.hidden, num.embed, num.label, - batch.size, input.size, - initializer=mx.init.uniform(0.01), - dropout=0) { - - arg.names <- rnn.sym$arguments - input.shapes <- list() - for (name in arg.names) { - if (grepl('init.c$', name) || grepl('init.h$', name)) { - input.shapes[[name]] <- c(num.hidden, batch.size) - } - else if (grepl('data$', name) || grepl('label$', name) ) { - if (seq.len == 1) { - input.shapes[[name]] <- c(batch.size) - } else { - input.shapes[[name]] <- c(seq.len, batch.size) - } - } - } - params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu()) - args <- input.shapes - args$symbol <- rnn.sym - args$ctx <- ctx - args$grad.req <- "add" - rnn.exec <- do.call(mx.simple.bind, args) - - mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE) - mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE) - - grad.arrays <- list() - for (name in names(rnn.exec$ref.grad.arrays)) { - if (is.param.name(name)) - grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0 - } - mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE) - - return (list(rnn.exec=rnn.exec, symbol=rnn.sym, - num.lstm.layer=num.lstm.layer, num.hidden=num.hidden, - seq.len=seq.len, batch.size=batch.size, - num.embed=num.embed)) - -} - - -calc.nll <- function(seq.label.probs, batch.size) { - nll = - sum(log(seq.label.probs)) / batch.size - return (nll) -} - -get.label <- function(label, ctx) { - label <- 
as.array(label) - seq.len <- dim(label)[[1]] - batch.size <- dim(label)[[2]] - sm.label <- array(0, dim=c(seq.len*batch.size)) - for (seqidx in 1:seq.len) { - sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,] - } - return (mx.nd.array(sm.label, ctx)) -} - - - -train.lstm <- function(model, train.data, eval.data, - num.round, update.period, - optimizer='sgd', ctx=mx.ctx.default(), ...) { - m <- model - seq.len <- m$seq.len - batch.size <- m$batch.size - num.lstm.layer <- m$num.lstm.layer - num.hidden <- m$num.hidden - - opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...) - - updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays) - epoch.counter <- 0 - log.period <- max(as.integer(1000 / seq.len), 1) - last.perp <- 10000000.0 - - for (iteration in 1:num.round) { - nbatch <- 0 - train.nll <- 0 - # reset states - init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- mx.nd.zeros(c(num.hidden, batch.size)) - init.states[[paste0("l", i, ".init.h")]] <- mx.nd.zeros(c(num.hidden, batch.size)) - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - - tic <- Sys.time() - - train.data$reset() - - while (train.data$iter.next()) { - # set rnn input - rnn.input <- train.data$value() - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) - - mx.exec.forward(m$rnn.exec, is.train=TRUE) - seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) - - mx.exec.backward(m$rnn.exec) - init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - # update epoch counter - epoch.counter <- epoch.counter + 1 - if (epoch.counter %% update.period == 0) { - # the gradient of initial c and inital h should be zero - init.grad <- list() - for (i in 1:num.lstm.layer) { - init.grad[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.grad[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE) - - arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays) - - mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE) - - grad.arrays <- list() - for (name in names(m$rnn.exec$ref.grad.arrays)) { - if (is.param.name(name)) - grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0 - } - mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE) - - } - - train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size) - - nbatch <- nbatch + seq.len - if ((epoch.counter %% log.period) == 0) { - cat(paste0("Epoch [", epoch.counter, - "] Train: NLL=", train.nll / nbatch, - ", Perp=", exp(train.nll / nbatch), "\n")) - } - } - train.data$reset() - # end of training loop - toc <- Sys.time() - cat(paste0("Iter [", iteration, - "] Train: Time: ", as.numeric(toc - tic, units="secs"), - " sec, NLL=", train.nll / nbatch, - ", Perp=", exp(train.nll / nbatch), "\n")) - - if (!is.null(eval.data)) { - val.nll <- 0.0 - # validation set, reset states - init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- 
m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - - eval.data$reset() - nbatch <- 0 - while (eval.data$iter.next()) { - # set rnn input - rnn.input <- eval.data$value() - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) - mx.exec.forward(m$rnn.exec, is.train=FALSE) - # probability of each label class, used to evaluate nll - seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) - # transfer the states - init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.states[[paste0("l", i, ".init.h")]] <- m$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size) - nbatch <- nbatch + seq.len - } - eval.data$reset() - perp <- exp(val.nll / nbatch) - cat(paste0("Iter [", iteration, - "] Val: NLL=", val.nll / nbatch, - ", Perp=", exp(val.nll / nbatch), "\n")) - } - } - - return (m) -} - -check.data <- function(data, batch.size, is.train) { - if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) { - stop("The dataset should be either a mx.io.DataIter or a R list") - } - if (is.list(data)) { - if (is.null(data$data) || is.null(data$label)){ - stop("Please provide dataset as list(data=R.array, label=R.array)") - } - data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train) - } - if (!is.null(data) && !data$iter.next()) { - data$reset() - if (!data$iter.next()) stop("Empty input") - } - return (data) -} #' Training LSTM Unrolled Model #' @@ -450,24 +206,36 @@ mx.lstm <- function(train.data, eval.data=NULL, num.embed=num.embed, num.label=num.label, dropout=dropout) + init.states.c <- lapply(1:num.lstm.layer, function(i) { + state.c <- paste0("l", i, ".init.c") + return (state.c) + }) + init.states.h <- lapply(1:num.lstm.layer, function(i) { + state.h <- paste0("l", i, ".init.h") + return (state.h) + }) + init.states.name <- c(init.states.c, init.states.h) + # set up lstm model model <- setup.rnn.model(rnn.sym=rnn.sym, ctx=ctx, - num.lstm.layer=num.lstm.layer, + num.rnn.layer=num.lstm.layer, seq.len=seq.len, num.hidden=num.hidden, num.embed=num.embed, num.label=num.label, batch.size=batch.size, input.size=input.size, + init.states.name=init.states.name, initializer=initializer, dropout=dropout) # train lstm model - model <- train.lstm(model, train.data, eval.data, + model <- train.rnn( model, train.data, eval.data, num.round=num.round, update.period=update.period, ctx=ctx, + init.states.name=init.states.name, ...) # change model into MXFeedForwardModel model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) @@ -487,7 +255,7 @@ mx.lstm <- function(train.data, eval.data=NULL, #' The output dim of embedding. #' @param num.label integer #' The number of labels. -#' @param batch.size integer +#' @param batch.size integer, default=1 #' The batch size used for R array training. #' @param arg.params list #' The batch size used for R array training. @@ -495,7 +263,8 @@ mx.lstm <- function(train.data, eval.data=NULL, #' Model parameter, list of name to NDArray of net's weights. 
#' @param dropout float, default=0 #' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @return model a lstm inference model. +#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) +#' A lstm inference model. #' #' @export mx.lstm.inference <- function(num.lstm.layer, @@ -507,24 +276,35 @@ mx.lstm.inference <- function(num.lstm.layer, arg.params, ctx=mx.cpu(), dropout=0.) { - sym <- lstm.inference.symbol(num.lstm.layer, - input.size, - num.hidden, - num.embed, - num.label, - dropout) + sym <- lstm.inference.symbol(num.lstm.layer=num.lstm.layer, + input.size=input.size, + num.hidden=num.hidden, + num.embed=num.embed, + num.label=num.label, + dropout=dropout) + + init.states.c <- lapply(1:num.lstm.layer, function(i) { + state.c <- paste0("l", i, ".init.c") + return (state.c) + }) + init.states.h <- lapply(1:num.lstm.layer, function(i) { + state.h <- paste0("l", i, ".init.h") + return (state.h) + }) + init.states.name <- c(init.states.c, init.states.h) seq.len <- 1 # set up lstm model model <- setup.rnn.model(rnn.sym=sym, ctx=ctx, - num.lstm.layer=num.lstm.layer, + num.rnn.layer=num.lstm.layer, seq.len=seq.len, num.hidden=num.hidden, num.embed=num.embed, num.label=num.label, batch.size=batch.size, input.size=input.size, + init.states.name=init.states.name, initializer=mx.init.uniform(0.01), dropout=dropout) arg.names <- names(model$rnn.exec$ref.arg.arrays) @@ -557,11 +337,10 @@ mx.lstm.inference <- function(num.lstm.layer, #' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. #' #' @export - mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { if (new.seq == TRUE) { init.states <- list() - for (i in 1:num.lstm.layer) { + for (i in 1:model$num.rnn.layer) { init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 } @@ -572,7 +351,7 @@ mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) mx.exec.forward(model$rnn.exec, is.train=FALSE) init.states <- list() - for (i in 1:num.lstm.layer) { + for (i in 1:model$num.rnn.layer) { init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]] init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] } diff --git a/R-package/R/mxnet_generated.R b/R-package/R/mxnet_generated.R index 08393769a368..d8e32ad58ea9 100644 --- a/R-package/R/mxnet_generated.R +++ b/R-package/R/mxnet_generated.R @@ -22,6 +22,64 @@ NULL #' @name mx.nd.argmax.channel NULL +#' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1. 
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.broadcast.axis
+NULL
+
+#' lhs divide rhs with broadcast
+#'
+#' @param lhs NDArray
+#'     Left operand to the function
+#' @param rhs NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.broadcast.div
+NULL
+
+#' lhs minus rhs with broadcast
+#'
+#' @param lhs NDArray
+#'     Left operand to the function
+#' @param rhs NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.broadcast.minus
+NULL
+
+#' lhs multiple rhs with broadcast
+#'
+#' @param lhs NDArray
+#'     Left operand to the function
+#' @param rhs NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.broadcast.mul
+NULL
+
+#' lhs add rhs with broadcast
+#'
+#' @param lhs NDArray
+#'     Left operand to the function
+#' @param rhs NDArray
+#'     Right operand to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.broadcast.plus
+NULL
+
 #' Take ceil value of the src
 #'
 #' @param src NDArray
 #'     Source input to the function
@@ -68,6 +126,16 @@ NULL
 #' @name mx.nd.cos
 NULL

+#' Crop the input matrix and return a new one
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.crop
+NULL
+
 #' Calculate dot product of two matrices or two vectors
 #'
 #' @param lhs NDArray
@@ -90,6 +158,16 @@ NULL
 #' @name mx.nd.exp
 NULL

+#' Expand the shape of array by inserting a new axis.
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.expand.dims
+NULL
+
 #' Fill one element of each line(row for python, column for R/Julia) in lhs according to index indicated by rhs and values indicated by mhs. This function assume rhs uses 0-based index.
 #'
 #' @param lhs NDArray
@@ -104,6 +182,16 @@ NULL
 #' @name mx.nd.fill.element.0index
 NULL

+#' Flip the input matrix along axis and return a new one
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.flip
+NULL
+
 #' Take floor value of the src
 #'
 #' @param src NDArray
@@ -124,7 +212,7 @@ NULL
 #' @name mx.nd.log
 NULL

-#' Take max of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #'
 #' @param src NDArray
 #'     Source input to the function
@@ -134,7 +222,17 @@ NULL
 #' @name mx.nd.max
 NULL

-#' Take min of the src.The result will be ndarray of shape (1,) on the same device.
+#' (Deprecated! Use max instead!) Take max of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.max.axis
+NULL
+
+#' Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #'
 #' @param src NDArray
 #'     Source input to the function
@@ -144,6 +242,16 @@ NULL
 #' @name mx.nd.min
 NULL

+#' (Deprecated! Use min instead!) Take min of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.min.axis
+NULL
+
 #' Take L2 norm of the src.The result will be ndarray of shape (1,) on the same device.
 #'
 #' @param src NDArray
@@ -194,6 +302,26 @@ NULL
 #' @name mx.nd.sin
 NULL

+#' Slice the input along certain axis and return a sliced array.
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.slice.axis
+NULL
+
+#' Calculate Smooth L1 Loss(lhs, scalar)
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.smooth.l1
+NULL
+
 #' Calculate cross_entropy(lhs, one_hot(rhs))
 #'
 #' @param lhs NDArray
@@ -226,7 +354,7 @@ NULL
 #' @name mx.nd.square
 NULL

-#' Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #'
 #' @param src NDArray
 #'     Source input to the function
@@ -236,6 +364,16 @@ NULL
 #' @name mx.nd.sum
 NULL

+#' (Deprecated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
+#'
+#' @param src NDArray
+#'     Source input to the function
+#' @return out The result mx.ndarray
+#'
+#' @export
+#' @name mx.nd.sum.axis
+NULL
+
 #' Transpose the input matrix and return a new one
 #'
 #' @param src NDArray
@@ -331,6 +469,8 @@ mx.io.CSVIter <- function(...) {
 #'     Dataset Param: Shape of each instance generated by the DataIter.
 #' @param inter.method int, optional, default='1'
 #'     Augmentation Param: 0-NN 1-bilinear 2-cubic 3-area 4-lanczos4 9-auto 10-rand.
+#' @param pad int, optional, default='0'
+#'     Augmentation Param: Padding size.
 #' @param mirror boolean, optional, default=False
 #'     Augmentation Param: Whether to mirror the image.
 #' @param rand.mirror boolean, optional, default=False
@@ -505,6 +645,19 @@ mx.symbol.Crop <- function(...) {
   mx.varg.symbol.Crop(list(...))
 }

+#' Custom operator implemented in frontend.
+#'
+#' @param op.type string
+#'     Type of custom operator. Must be registered first.
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.Custom <- function(...) {
+  mx.varg.symbol.Custom(list(...))
+}
+
 #' Apply deconvolution to input then add a bias.
 #'
 #' @param data Symbol
@@ -518,7 +671,11 @@ mx.symbol.Crop <- function(...) {
 #' @param stride Shape(tuple), optional, default=(1,1)
 #'     deconvolution stride: (y, x)
 #' @param pad Shape(tuple), optional, default=(0,0)
-#'     pad for deconvolution: (y, x)
+#'     pad for deconvolution: (y, x), a good number is: (kernel-1)/2, if target_shape set, pad will be ignored and will be computed automatically
+#' @param adj Shape(tuple), optional, default=(0,0)
+#'     adjustment for output shape: (y, x), if target_shape set, adj will be ignored and will be computed automatically
+#' @param target.shape Shape(tuple), optional, default=(0,0)
+#'     output shape with target shape: (y, x)
 #' @param num.filter int (non-negative), required
 #'     deconvolution filter(channel) number
 #' @param num.group int (non-negative), optional, default=1
@@ -586,7 +743,7 @@ mx.symbol.Embedding <- function(...) {

 #' Flatten input
 #'
 #' @param data Symbol
-#'     Input data to flatten.
+#'     Input data to flatten.
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -745,10 +902,27 @@ mx.symbol.MAERegressionOutput <- function(...) {
   mx.varg.symbol.MAERegressionOutput(list(...))
 }

+#' Get output from a symbol and pass 1 gradient back. This is used as a terminal loss if unary and binary operator are used to composite a loss with no declaration of backward dependency
+#'
+#' @param data Symbol
+#'     Input data.
+#' @param grad.scale float, optional, default=1
+#'     gradient scale as a supplement to unary and binary operators
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.MakeLoss <- function(...) {
+  mx.varg.symbol.MakeLoss(list(...))
+}
+
 #' Perform spatial pooling on inputs.
 #'
 #' @param data Symbol
 #'     Input data to the pooling operator.
+#' @param global.pool boolean, optional, default=False
+#'     Ignore kernel size, do global pooling based on current input feature map. This is useful for input with different shapes
 #' @param kernel Shape(tuple), required
 #'     pooling kernel size: (y, x)
 #' @param pool.type {'avg', 'max', 'sum'}, required
@@ -766,14 +940,16 @@ mx.symbol.Pooling <- function(...) {
   mx.varg.symbol.Pooling(list(...))
 }

-#' Resize regions of interest in an input plane to a fixed size by MAX pooling.
+#' Performs region-of-interest pooling on inputs. Resize bounding box coordinates by spatial_scale and crop input feature maps accordingly. The cropped feature maps are pooled by max pooling to a fixed size output indicated by pooled_size. batch_size will change to the number of region bounding boxes after ROIPooling
 #'
-#' @param data Symbol[]
-#'     [input tensor, regions of interest]
+#' @param data Symbol
+#'     Input data to the pooling operator, a 4D feature map
+#' @param rois Symbol
+#'     Bounding box coordinates, a 2D array of [[batch_index, x1, y1, x2, y2]]. (x1, y1) and (x2, y2) are top left and bottom right corners of designated region of interest. batch_index indicates the index of corresponding image in the input data
 #' @param pooled.size Shape(tuple), required
-#'     target size: (h, w)
+#'     fixed pooled size: (h, w)
 #' @param spatial.scale float, required
-#'     Ratio of input plane height (or w) to raw image height (or w).
+#'     Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal of total stride in convolutional layers
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -786,11 +962,13 @@ mx.symbol.ROIPooling <- function(...) {

 #' Reshape input to target shape
 #'
 #' @param data Symbol
-#'     Input data to reshape.
-#' @param target.shape Shape(tuple), required
-#'     Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims
+#'     Input data to reshape.
+#' @param target.shape Shape(tuple), optional, default=(0,0)
+#'     (Deprecated! Use shape instead.) Target new shape. One and only one dim can be 0, in which case it will be inferred from the rest of dims
 #' @param keep.highest boolean, optional, default=False
-#'     Whether keep the highest dim unchanged.If set to yes, than the first dim in target_shape is ignored,and always fixed as input
+#'     (Deprecated! Use shape instead.) Whether keep the highest dim unchanged. If set to yes, then the first dim in target_shape is ignored, and always fixed as input
+#' @param shape , optional, default=()
+#'     Target new shape. If the dim is same, set it to 0. If the dim is set to be -1, it will be inferred from the rest of dims. One and only one dim can be -1
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -829,6 +1007,8 @@ mx.symbol.SliceChannel <- function(...) {
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n output, each has k classes
 #' @param use.ignore boolean, optional, default=False
 #'     If set to true, the ignore_label value will not contribute to the backward gradient
+#' @param normalization {'batch', 'null', 'valid'}, optional, default='null'
+#'     If set to null, op will do nothing on output gradient. If set to batch, op will normalize gradient by dividing by the batch size. If set to valid, op will normalize gradient by dividing by the number of samples not ignored
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -858,7 +1038,7 @@ mx.symbol.SoftmaxActivation <- function(...) {
 #' @param data Symbol
 #'     Input data to softmax.
 #' @param label Symbol
-#'     Label data.
+#'     Label data, can also be probability value with same shape as data
 #' @param grad.scale float, optional, default=1
 #'     Scale the gradient by a float factor
 #' @param ignore.label float, optional, default=-1
@@ -867,6 +1047,8 @@ mx.symbol.SoftmaxActivation <- function(...) {
 #'     If set to true, for a (n,k,x_1,..,x_n) dimensional input tensor, softmax will generate n*x_1*...*x_n output, each has k classes
 #' @param use.ignore boolean, optional, default=False
 #'     If set to true, the ignore_label value will not contribute to the backward gradient
+#' @param normalization {'batch', 'null', 'valid'}, optional, default='null'
+#'     If set to null, op will do nothing on output gradient. If set to batch, op will normalize gradient by dividing by the batch size. If set to valid, op will normalize gradient by dividing by the number of samples not ignored
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -907,6 +1089,8 @@ mx.symbol.SwapAxis <- function(...) {
 #'     How to handle multiple input. concat means concatenate upsampled images along the channel dimension. sum means add all images together, only available for nearest neighbor upsampling.
 #' @param num.args int, required
 #'     Number of inputs to be upsampled. For nearest neighbor upsampling, this can be 1-N; the size of output will be (scale*h_0,scale*w_0) and all other inputs will be upsampled to the same size. For bilinear upsampling this must be 2; 1 input and 1 weight.
+#' @param workspace long (non-negative), optional, default=512 +#' Tmp workspace for deconvolution (MB) #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -918,24 +1102,93 @@ mx.symbol.UpSampling <- function(...) { #' Take absolute value of the src #' +#' @param src Symbol +#' Left symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.abs <- function(...) { + mx.varg.symbol.abs(list(...)) +} + +#' Broadcast data in the given axis to the given size. The original size of the broadcasting axis must be 1. +#' +#' @param src Symbol +#' Left symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_axis <- function(...) { + mx.varg.symbol.broadcast_axis(list(...)) +} + +#' lhs divide rhs with broadcast +#' #' @param lhs Symbol #' Left symbolic input to the function #' @param rhs Symbol +#' Right symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_div <- function(...) { + mx.varg.symbol.broadcast_div(list(...)) +} + +#' lhs minus rhs with broadcast +#' +#' @param lhs Symbol #' Left symbolic input to the function +#' @param rhs Symbol +#' Right symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol #' #' @export -mx.symbol.abs <- function(...) { - mx.varg.symbol.abs(list(...)) +mx.symbol.broadcast_minus <- function(...) { + mx.varg.symbol.broadcast_minus(list(...)) } -#' Take ceil value of the src +#' lhs multiple rhs with broadcast +#' +#' @param lhs Symbol +#' Left symbolic input to the function +#' @param rhs Symbol +#' Right symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_mul <- function(...) { + mx.varg.symbol.broadcast_mul(list(...)) +} + +#' lhs add rhs with broadcast #' #' @param lhs Symbol #' Left symbolic input to the function #' @param rhs Symbol +#' Right symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.broadcast_plus <- function(...) { + mx.varg.symbol.broadcast_plus(list(...)) +} + +#' Take ceil value of the src +#' +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -948,9 +1201,7 @@ mx.symbol.ceil <- function(...) { #' Take cos of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -966,7 +1217,7 @@ mx.symbol.cos <- function(...) { #' @param lhs Symbol #' Left symbolic input to the function #' @param rhs Symbol -#' Left symbolic input to the function +#' Right symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. #' @return out The result mx.symbol @@ -978,9 +1229,7 @@ mx.symbol.dot <- function(...) 
{ #' Take exp of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -991,11 +1240,22 @@ mx.symbol.exp <- function(...) { mx.varg.symbol.exp(list(...)) } -#' Take floor value of the src +#' Expand the shape of array by inserting a new axis. #' -#' @param lhs Symbol +#' @param src Symbol #' Left symbolic input to the function -#' @param rhs Symbol +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.expand_dims <- function(...) { + mx.varg.symbol.expand_dims(list(...)) +} + +#' Take floor value of the src +#' +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1008,9 +1268,7 @@ mx.symbol.floor <- function(...) { #' Take log of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1021,11 +1279,26 @@ mx.symbol.log <- function(...) { mx.varg.symbol.log(list(...)) } +#' Sample a normal distribution +#' +#' @param loc float, optional, default=0 +#' Mean of the distribution. +#' @param scale float, optional, default=1 +#' Standard deviation of the distribution. +#' @param shape Shape(tuple), required +#' The shape of the output +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.normal <- function(...) { + mx.varg.symbol.normal(list(...)) +} + #' Take round value of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1038,9 +1311,7 @@ mx.symbol.round <- function(...) { #' Take rsqrt of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1053,9 +1324,7 @@ mx.symbol.rsqrt <- function(...) { #' Take sign value of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1068,9 +1337,7 @@ mx.symbol.sign <- function(...) { #' Take sin of the src #' -#' @param lhs Symbol -#' Left symbolic input to the function -#' @param rhs Symbol +#' @param src Symbol #' Left symbolic input to the function #' @param name string, optional #' Name of the resulting symbol. @@ -1081,12 +1348,38 @@ mx.symbol.sin <- function(...) { mx.varg.symbol.sin(list(...)) } +#' Slice the input along certain axis and return a sliced array. +#' +#' @param src Symbol +#' Left symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. +#' @return out The result mx.symbol +#' +#' @export +mx.symbol.slice_axis <- function(...) { + mx.varg.symbol.slice_axis(list(...)) +} + +#' Calculate Smooth L1 Loss(lhs, scalar) +#' +#' @param src Symbol +#' Left symbolic input to the function +#' @param name string, optional +#' Name of the resulting symbol. 
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.smooth_l1 <- function(...) {
+  mx.varg.symbol.smooth_l1(list(...))
+}
+
 #' Calculate cross_entropy(lhs, one_hot(rhs))
 #'
 #' @param lhs Symbol
 #'     Left symbolic input to the function
 #' @param rhs Symbol
-#'     Left symbolic input to the function
+#'     Right symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
 #' @return out The result mx.symbol
@@ -1098,9 +1391,7 @@ mx.symbol.softmax_cross_entropy <- function(...) {

 #' Take sqrt of the src
 #'
-#' @param lhs Symbol
-#'     Left symbolic input to the function
-#' @param rhs Symbol
+#' @param src Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1113,9 +1404,7 @@ mx.symbol.sqrt <- function(...) {

 #' Take square of the src
 #'
-#' @param lhs Symbol
-#'     Left symbolic input to the function
-#' @param rhs Symbol
+#' @param src Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1126,11 +1415,9 @@ mx.symbol.square <- function(...) {
   mx.varg.symbol.square(list(...))
 }

-#' Take sum of the src.The result will be ndarray of shape (1,) on the same device.
+#' Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #'
-#' @param lhs Symbol
-#'     Left symbolic input to the function
-#' @param rhs Symbol
+#' @param src Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1141,11 +1428,22 @@ mx.symbol.sum <- function(...) {
   mx.varg.symbol.sum(list(...))
 }

-#' Transpose the input matrix and return a new one
+#' (Deprecated! Use sum instead!) Take sum of the src in the given axis. Params: `axis` and `keepdims`. axis: tuple or integer of axes to reduce, global reduce will be performed if not set. keepdims: the same meaning as Numpy.
 #'
-#' @param lhs Symbol
+#' @param src Symbol
 #'     Left symbolic input to the function
-#' @param rhs Symbol
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.sum_axis <- function(...) {
+  mx.varg.symbol.sum_axis(list(...))
+}
+
+#' Transpose the input matrix and return a new one
+#'
+#' @param src Symbol
 #'     Left symbolic input to the function
 #' @param name  string, optional
 #'     Name of the resulting symbol.
@@ -1155,3 +1453,20 @@ mx.symbol.sum <- function(...) {
 mx.symbol.transpose <- function(...) {
   mx.varg.symbol.transpose(list(...))
 }
+
+#' Sample a uniform distribution
+#'
+#' @param low float, optional, default=0
+#'     The lower bound of distribution
+#' @param high float, optional, default=1
+#'     The upper bound of distribution
+#' @param shape Shape(tuple), required
+#'     The shape of the output
+#' @param name  string, optional
+#'     Name of the resulting symbol.
+#' @return out The result mx.symbol
+#'
+#' @export
+mx.symbol.uniform <- function(...) {
+  mx.varg.symbol.uniform(list(...))
+}
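The `broadcast_*` symbol wrappers above (and their `mx.nd.broadcast.*` NDArray counterparts documented earlier) follow the stated rule that the broadcast axis of one operand must have size 1. A minimal usage sketch, with illustrative shapes and values; the dimension convention assumed here is the usual R one, and is not specified by this diff:

```r
library(mxnet)

# `b` has size 1 on its second axis, so it is broadcast across the
# three columns of `a` (per the rule in the generated docs above).
a <- mx.nd.array(array(1:6, dim = c(2, 3)))        # 2 x 3
b <- mx.nd.array(array(c(10, 20), dim = c(2, 1)))  # 2 x 1
out <- mx.nd.broadcast.plus(a, b)                  # element-wise add with broadcast
as.array(out)                                      # 2 x 3 result
```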
diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R
index 18c4d81aa9ac..36543931d1f6 100644
--- a/R-package/R/optimizer.R
+++ b/R-package/R/optimizer.R
@@ -66,6 +66,89 @@ mx.opt.sgd <- function(learning.rate,
   return(list(create.state=create.state, update=update))
 }
 
+#' Create an RMSProp optimizer with respective parameters.
+#' Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+#' The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
+#'
+#' @param learning.rate float, default=0.002
+#'      Step size.
+#' @param gamma1 float, default=0.95
+#'      decay factor of moving average for gradient, gradient^2.
+#' @param gamma2 float, default=0.9
+#'      "momentum" factor.
+#' @param wd float, default=0.0
+#'      L2 regularization coefficient added to all the weights.
+#' @param rescale.grad float, default=1.0
+#'      rescaling factor of gradient.
+#' @param clip_gradient float, optional
+#'      clip gradient in range [-clip_gradient, clip_gradient].
+#' @param lr_scheduler function, optional
+#'      The learning rate scheduler.
+#'
+mx.opt.rmsprop <- function(learning.rate=0.002,
+                           gamma1=0.95,
+                           gamma2=0.9,
+                           wd=0,
+                           rescale.grad=1,
+                           clip_gradient = NULL,
+                           lr_scheduler = NULL) {
+  # use lr as short for learning rate.
+  lr <- learning.rate
+  count <- 0
+  num_update <- 0
+
+  rmsprop <- new.env()
+  rmsprop$lr <- lr
+  rmsprop$count <- 0
+  rmsprop$num_update <- 0
+
+  create.state <- function(index, weight) {
+    return (list(n=mx.nd.zeros(dim(weight), ctx(weight)),
+                 g=mx.nd.zeros(dim(weight), ctx(weight)),
+                 delta=mx.nd.zeros(dim(weight), ctx(weight))))
+  }
+
+  update <- function(index, weight, grad, state) {
+    if (!is.null(lr_scheduler)){
+      lr_scheduler(rmsprop) ## changing lr
+      lr <- rmsprop$lr
+      ## update count
+      indexKey <- paste0('ik', index)
+      if (!exists(envir = rmsprop, x = indexKey)){
+        assign(x = indexKey, value = 0, envir = rmsprop)
+      } else {
+        indexValue <- get(envir = rmsprop, x = indexKey)
+        assign(x = indexKey, value = indexValue + 1, envir = rmsprop)
+        rmsprop$num_update <- max(rmsprop$num_update, get(envir = rmsprop, x = indexKey))
+      }
+    }
+    grad <- grad * rescale.grad
+    if (!is.null(clip_gradient)){
+      if(clip_gradient >= 0){
+        grad_ctx <- ctx(grad)
+        grad <- as.array(grad)
+        grad <- pmax(grad, -1 * clip_gradient)
+        grad <- pmin(grad, clip_gradient)
+        grad <- mx.nd.array(grad, grad_ctx)
+      } else {
+        stop("Error: clip_gradient should be a positive number.")
+      }
+    }
+
+    n <- state$n
+    g <- state$g
+    delta <- state$delta
+    n <- gamma1 * n + (1 - gamma1) * (grad * grad)
+    g <- gamma1 * g + (1 - gamma1) * grad
+    delta <- gamma2 * delta - lr * (grad / mx.nd.sqrt(n - g*g + 1e-4) + wd * weight)
+    weight <- weight + delta
+    state <- list(n=n, g=g, delta=delta)
+
+    return(list(weight=weight, state=state))
+  }
+  return(list(create.state=create.state, update=update))
+}
+
 #' Create an optimizer by name and parameters
 #'
 #' @param name The name of the optimizer
@@ -76,6 +159,9 @@ mx.opt.create <- function(name, ...) {
   if (name == "sgd") {
     return(mx.opt.sgd(...))
   }
+  else if (name == "rmsprop") {
+    return (mx.opt.rmsprop(...))
+  }
   stop(paste("Unknown optimizer ", name))
 }
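For reviewers, a minimal sketch of reaching the new optimizer through `mx.opt.create`. The comments restate the update rule implemented above (Graves Eq. 38-45); the commented-out training call is an assumption for illustration, not part of the diff:

```r
library(mxnet)

# Per-weight updates performed by the returned update() closure:
#   n     <- gamma1 * n + (1 - gamma1) * grad^2
#   g     <- gamma1 * g + (1 - gamma1) * grad
#   delta <- gamma2 * delta - lr * (grad / sqrt(n - g^2 + 1e-4) + wd * weight)
#   w     <- w + delta
opt <- mx.opt.create("rmsprop", learning.rate = 0.002,
                     gamma1 = 0.95, gamma2 = 0.9, wd = 1e-5)

# Training entry points that take an optimizer by name should now accept
# optimizer = "rmsprop" as well (illustrative):
# model <- mx.model.FeedForward.create(net, X = train.iter, optimizer = "rmsprop")
```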
diff --git a/R-package/R/rnn.R b/R-package/R/rnn.R
new file mode 100644
index 000000000000..b89559a58570
--- /dev/null
+++ b/R-package/R/rnn.R
@@ -0,0 +1,342 @@
+# rnn cell symbol
+rnn <- function(num.hidden, indata, prev.state, param, seqidx,
+                layeridx, dropout=0., batch.norm=FALSE) {
+    if (dropout > 0. )
+        indata <- mx.symbol.Dropout(data=indata, p=dropout)
+    i2h <- mx.symbol.FullyConnected(data=indata,
+                                    weight=param$i2h.weight,
+                                    bias=param$i2h.bias,
+                                    num.hidden=num.hidden,
+                                    name=paste0("t", seqidx, ".l", layeridx, ".i2h"))
+    h2h <- mx.symbol.FullyConnected(data=prev.state$h,
+                                    weight=param$h2h.weight,
+                                    bias=param$h2h.bias,
+                                    num.hidden=num.hidden,
+                                    name=paste0("t", seqidx, ".l", layeridx, ".h2h"))
+    hidden <- i2h + h2h
+
+    hidden <- mx.symbol.Activation(data=hidden, act.type="tanh")
+    if (batch.norm)
+        hidden <- mx.symbol.BatchNorm(data=hidden)
+    return (list(h=hidden))
+}
+
+# unrolled rnn network
+rnn.unroll <- function(num.rnn.layer, seq.len, input.size, num.hidden,
+                       num.embed, num.label, dropout=0., batch.norm=FALSE) {
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+    param.cells <- lapply(1:num.rnn.layer, function(i) {
+        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
+                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
+                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
+                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.rnn.layer, function(i) {
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer
+    label <- mx.symbol.Variable("label")
+    data <- mx.symbol.Variable("data")
+    embed <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                 weight=embed.weight, output_dim=num.embed, name="embed")
+    wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1)
+
+    last.hidden <- list()
+    for (seqidx in 1:seq.len) {
+        hidden <- wordvec[[seqidx]]
+        # stack RNN
+        for (i in 1:num.rnn.layer) {
+            dp <- ifelse(i==1, 0, dropout)
+            next.state <- rnn(num.hidden, indata=hidden,
+                              prev.state=last.states[[i]],
+                              param=param.cells[[i]],
+                              seqidx=seqidx, layeridx=i,
+                              dropout=dp, batch.norm=batch.norm)
+            hidden <- next.state$h
+            last.states[[i]] <- next.state
+        }
+        # decoder
+        if (dropout > 0.)
+            hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+        last.hidden <- c(last.hidden, hidden)
+    }
+    last.hidden$dim <- 0
+    last.hidden$num.args <- seq.len
+    concat <- mxnet:::mx.varg.symbol.Concat(last.hidden)
+    fc <- mx.symbol.FullyConnected(data=concat,
+                                   weight=cls.weight,
+                                   bias=cls.bias,
+                                   num.hidden=num.label)
+    label <- mx.symbol.transpose(data=label)
+    label <- mx.symbol.Reshape(data=label, target.shape=c(0))
+
+    loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm")
+    return (loss.all)
+}
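A hedged sketch of what the builder above returns; the numbers are made up for illustration, and `rnn.unroll` is internal to the package (user code goes through `mx.rnn` further down):

```r
# Hypothetical call: a 2-layer RNN unrolled over 32 steps for a 128-way
# character vocabulary; yields one SoftmaxOutput symbol whose l<i>.* weight
# variables are shared across every time step.
sym <- rnn.unroll(num.rnn.layer = 2, seq.len = 32, input.size = 128,
                  num.hidden = 64, num.embed = 16, num.label = 128,
                  dropout = 0.2)
```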
+
+# rnn inference model symbol
+rnn.inference.symbol <- function(num.rnn.layer, seq.len, input.size, num.hidden,
+                                 num.embed, num.label, dropout=0., batch.norm=FALSE) {
+    seqidx <- 0
+    embed.weight <- mx.symbol.Variable("embed.weight")
+    cls.weight <- mx.symbol.Variable("cls.weight")
+    cls.bias <- mx.symbol.Variable("cls.bias")
+    param.cells <- lapply(1:num.rnn.layer, function(i) {
+        cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")),
+                     i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")),
+                     h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")),
+                     h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias")))
+        return (cell)
+    })
+    last.states <- lapply(1:num.rnn.layer, function(i) {
+        state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h")))
+        return (state)
+    })
+
+    # embedding layer
+    data <- mx.symbol.Variable("data")
+    hidden <- mx.symbol.Embedding(data=data, input_dim=input.size,
+                                  weight=embed.weight, output_dim=num.embed, name="embed")
+    # stack RNN
+    for (i in 1:num.rnn.layer) {
+        dp <- ifelse(i==1, 0, dropout)
+        next.state <- rnn(num.hidden, indata=hidden,
+                          prev.state=last.states[[i]],
+                          param=param.cells[[i]],
+                          seqidx=seqidx, layeridx=i,
+                          dropout=dp, batch.norm=batch.norm)
+        hidden <- next.state$h
+        last.states[[i]] <- next.state
+    }
+    # decoder
+    if (dropout > 0.)
+        hidden <- mx.symbol.Dropout(data=hidden, p=dropout)
+
+    fc <- mx.symbol.FullyConnected(data=hidden,
+                                   weight=cls.weight,
+                                   bias=cls.bias,
+                                   num_hidden=num.label)
+    sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm')
+    unpack.h <- lapply(1:num.rnn.layer, function(i) {
+        state <- last.states[[i]]
+        state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h"))
+        return (state.h)
+    })
+    list.all <- c(sm, unpack.h)
+    return (mx.symbol.Group(list.all))
+}
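Note the design here: a single step of this graph exposes both the softmax output `sm` and the gradient-blocked states `l<i>.last.h`, which is what lets the forward helper further down carry hidden state between calls. A sketch under the same illustrative assumptions as above:

```r
# Hypothetical direct use (normally wrapped by mx.rnn.inference below):
# the grouped outputs are "sm_output" plus "l1.last.h_output"; the latter is
# fed back in as l1.init.h on the next step.
sym <- rnn.inference.symbol(num.rnn.layer = 1, seq.len = 1, input.size = 128,
                            num.hidden = 64, num.embed = 16, num.label = 128)
```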
+
+#' Training RNN Unrolled Model
+#'
+#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array)
+#'      The Training set.
+#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional
+#'      The validation set used for validation evaluation during the progress.
+#' @param num.rnn.layer integer
+#'      The number of the layer of rnn.
+#' @param seq.len integer
+#'      The length of the input sequence.
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label integer
+#'      The number of labels.
+#' @param batch.size integer
+#'      The batch size used for R array training.
+#' @param input.size integer
+#'      The input dim of one-hot encoding of embedding
+#' @param ctx mx.context, optional
+#'      The device used to perform training.
+#' @param num.round integer, default=10
+#'      The number of iterations over training data to train the model.
+#' @param update.period integer, default=1
+#'      The number of iterations to update parameters during training period.
+#' @param initializer initializer object. default=mx.init.uniform(0.01)
+#'      The initialization scheme for parameters.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param optimizer string, default="sgd"
+#'      The optimization method.
+#' @param batch.norm boolean, default=FALSE
+#'      Whether to use batch normalization.
+#' @param ... other parameters passed to \code{mx.rnn}.
+#' @return model A trained rnn unrolled model.
+#'
+#' @export
+mx.rnn <- function( train.data, eval.data=NULL,
+                    num.rnn.layer, seq.len,
+                    num.hidden, num.embed, num.label,
+                    batch.size, input.size,
+                    ctx=mx.ctx.default(),
+                    num.round=10, update.period=1,
+                    initializer=mx.init.uniform(0.01),
+                    dropout=0, optimizer='sgd',
+                    batch.norm=FALSE,
+                    ...) {
+    # check data and change data into iterator
+    train.data <- check.data(train.data, batch.size, TRUE)
+    eval.data <- check.data(eval.data, batch.size, FALSE)
+
+    # get unrolled rnn symbol
+    rnn.sym <- rnn.unroll( num.rnn.layer=num.rnn.layer,
+                           num.hidden=num.hidden,
+                           seq.len=seq.len,
+                           input.size=input.size,
+                           num.embed=num.embed,
+                           num.label=num.label,
+                           dropout=dropout,
+                           batch.norm=batch.norm)
+    init.states.name <- lapply(1:num.rnn.layer, function(i) {
+        state <- paste0("l", i, ".init.h")
+        return (state)
+    })
+    # set up rnn model
+    model <- setup.rnn.model(rnn.sym=rnn.sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.rnn.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=initializer,
+                             dropout=dropout)
+    # train rnn model
+    model <- train.rnn( model, train.data, eval.data,
+                        num.round=num.round,
+                        update.period=update.period,
+                        ctx=ctx,
+                        init.states.name=init.states.name,
+                        ...)
+    # change model into MXFeedForwardModel
+    model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays)
+    return(structure(model, class="MXFeedForwardModel"))
+}
+
+#' Create an RNN Inference Model
+#'
+#' @param num.rnn.layer integer
+#'      The number of the layer of rnn.
+#' @param input.size integer
+#'      The input dim of one-hot encoding of embedding
+#' @param num.hidden integer
+#'      The number of hidden nodes.
+#' @param num.embed integer
+#'      The output dim of embedding.
+#' @param num.label integer
+#'      The number of labels.
+#' @param batch.size integer, default=1
+#'      The batch size used for R array training.
+#' @param arg.params list
+#'      Model parameter, list of name to NDArray of net's weights.
+#' @param ctx mx.context, optional
+#'      The device used to perform inference.
+#' @param dropout float, default=0
+#'      A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
+#' @param batch.norm boolean, default=FALSE
+#'      Whether to use batch normalization.
+#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
+#'      A rnn inference model.
+#'
+#' @export
+mx.rnn.inference <- function( num.rnn.layer,
+                              input.size,
+                              num.hidden,
+                              num.embed,
+                              num.label,
+                              batch.size=1,
+                              arg.params,
+                              ctx=mx.cpu(),
+                              dropout=0.,
+                              batch.norm=FALSE) {
+    sym <- rnn.inference.symbol( num.rnn.layer=num.rnn.layer,
+                                 input.size=input.size,
+                                 num.hidden=num.hidden,
+                                 num.embed=num.embed,
+                                 num.label=num.label,
+                                 dropout=dropout,
+                                 batch.norm=batch.norm)
+    # init.states.name <- c()
+    # for (i in 1:num.rnn.layer) {
+    #     init.states.name <- c(init.states.name, paste0("l", i, ".init.c"))
+    #     init.states.name <- c(init.states.name, paste0("l", i, ".init.h"))
+    # }
+    init.states.name <- lapply(1:num.rnn.layer, function(i) {
+        state <- paste0("l", i, ".init.h")
+        return (state)
+    })
+
+    seq.len <- 1
+    # set up rnn model
+    model <- setup.rnn.model(rnn.sym=sym,
+                             ctx=ctx,
+                             num.rnn.layer=num.rnn.layer,
+                             seq.len=seq.len,
+                             num.hidden=num.hidden,
+                             num.embed=num.embed,
+                             num.label=num.label,
+                             batch.size=batch.size,
+                             input.size=input.size,
+                             init.states.name=init.states.name,
+                             initializer=mx.init.uniform(0.01),
+                             dropout=dropout)
+    arg.names <- names(model$rnn.exec$ref.arg.arrays)
+    for (k in names(arg.params)) {
+        if ((k %in% arg.names) && is.param.name(k) ) {
+            rnn.input <- list()
+            rnn.input[[k]] <- arg.params[[k]]
+            mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE)
+        }
+    }
+    init.states <- list()
+    for (i in 1:num.rnn.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+
+    return (model)
+}
+
+#' Using forward function to predict in rnn inference model
+#'
+#' @param model rnn model
+#'      A rnn inference model
+#' @param input.data array.matrix
+#'      The input data for forward function
+#' @param new.seq boolean, default=FALSE
+#'      Whether the input is the start of a new sequence
+#'
+#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model.
+#'
+#' @export
+mx.rnn.forward <- function(model, input.data, new.seq=FALSE) {
+    if (new.seq == TRUE) {
+        init.states <- list()
+        for (i in 1:model$num.rnn.layer) {
+            init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0
+        }
+        mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    }
+    dim(input.data) <- c(model$batch.size)
+    data <- list(data=mx.nd.array(input.data))
+    mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE)
+    mx.exec.forward(model$rnn.exec, is.train=FALSE)
+    # carry the hidden state over to the next call
+    init.states <- list()
+    for (i in 1:model$num.rnn.layer) {
+        init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]]
+    }
+    mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE)
+    prob <- model$rnn.exec$ref.outputs[["sm_output"]]
+    return (list(prob=prob, model=model))
+}
diff --git a/R-package/R/rnn_model.R b/R-package/R/rnn_model.R
new file mode 100644
index 000000000000..19f53b0f6407
--- /dev/null
+++ b/R-package/R/rnn_model.R
@@ -0,0 +1,244 @@
+is.param.name <- function(name) {
+    return (grepl('weight$', name) || grepl('bias$', name) ||
+            grepl('gamma$', name) || grepl('beta$', name) )
+}
+
+# Initialize parameters
+mx.model.init.params.rnn <- function(symbol, input.shape, initializer, ctx) {
+    if (!is.mx.symbol(symbol)) stop("symbol need to be MXSymbol")
+    slist <- symbol$infer.shape(input.shape)
+    if (is.null(slist)) stop("Not enough information to get shapes")
+    arg.params <- mx.init.create(initializer, slist$arg.shapes, ctx, skip.unknown=TRUE)
+    aux.params <- mx.init.create(initializer, slist$aux.shapes, ctx, skip.unknown=FALSE)
+    return(list(arg.params=arg.params, aux.params=aux.params))
+}
+
+# Initialize the data iter
+mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) {
+    if (is.MXDataIter(X)) return(X)
+    shape <- dim(X)
+    if (is.null(shape)) {
+        num.data <- length(X)
+    } else {
+        ndim <- length(shape)
+        num.data <- shape[[ndim]]
+    }
+    if (is.null(y)) {
+        if (is.train) stop("Need to provide parameter y for training with R arrays.")
+        y <- c(1:num.data) * 0
+    }
+
+    batch.size <- min(num.data, batch.size)
+
+    return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train))
+}
+
+# set up rnn model with rnn cells
+setup.rnn.model <- function(rnn.sym, ctx,
+                            num.rnn.layer, seq.len,
+                            num.hidden, num.embed, num.label,
+                            batch.size, input.size,
+                            init.states.name,
+                            initializer=mx.init.uniform(0.01),
+                            dropout=0) {
+
+    arg.names <- rnn.sym$arguments
+    input.shapes <- list()
+    for (name in arg.names) {
+        if (name %in% init.states.name) {
+            input.shapes[[name]] <- c(num.hidden, batch.size)
+        }
+        else if (grepl('data$', name) || grepl('label$', name) ) {
+            if (seq.len == 1) {
+                input.shapes[[name]] <- c(batch.size)
+            } else {
+                input.shapes[[name]] <- c(seq.len, batch.size)
+            }
+        }
+    }
+    params <- mx.model.init.params.rnn(rnn.sym, input.shapes, initializer, mx.cpu())
+    args <- input.shapes
+    args$symbol <- rnn.sym
+    args$ctx <- ctx
+    args$grad.req <- "add"
+    rnn.exec <- do.call(mx.simple.bind, args)
+
+    mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE)
+    mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE)
+
+    grad.arrays <- list()
+    for (name in names(rnn.exec$ref.grad.arrays)) {
+        if (is.param.name(name))
+            grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0
+    }
+    mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE)
+
+    return (list(rnn.exec=rnn.exec, symbol=rnn.sym,
+                 num.rnn.layer=num.rnn.layer, num.hidden=num.hidden,
+                 seq.len=seq.len, batch.size=batch.size,
+                 num.embed=num.embed))
+
+}
+
+
+calc.nll <- function(seq.label.probs, batch.size) {
+    nll = - sum(log(seq.label.probs)) / batch.size
+    return (nll)
+}
+
+get.label <- function(label, ctx) {
+    label <- as.array(label)
+    seq.len <- dim(label)[[1]]
+    batch.size <- dim(label)[[2]]
+    sm.label <- array(0, dim=c(seq.len*batch.size))
+    for (seqidx in 1:seq.len) {
+        sm.label[((seqidx-1)*batch.size+1) : (seqidx*batch.size)] <- label[seqidx,]
+    }
+    return (mx.nd.array(sm.label, ctx))
+}
+
+
+# training rnn model
+train.rnn <- function (model, train.data, eval.data,
+                       num.round, update.period,
+                       init.states.name,
+                       optimizer='sgd', ctx=mx.ctx.default(), ...) {
+    m <- model
+    seq.len <- m$seq.len
+    batch.size <- m$batch.size
+    num.rnn.layer <- m$num.rnn.layer
+    num.hidden <- m$num.hidden
+
+    opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...)
+
+    updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays)
+    epoch.counter <- 0
+    log.period <- max(as.integer(1000 / seq.len), 1)
+    last.perp <- 10000000.0
+
+    for (iteration in 1:num.round) {
+        nbatch <- 0
+        train.nll <- 0
+        # reset states
+        init.states <- list()
+        for (name in init.states.name) {
+            init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+        }
+
+        mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+
+        tic <- Sys.time()
+
+        train.data$reset()
+
+        while (train.data$iter.next()) {
+            # set rnn input
+            rnn.input <- train.data$value()
+            mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
+
+            mx.exec.forward(m$rnn.exec, is.train=TRUE)
+            seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
+
+            mx.exec.backward(m$rnn.exec)
+            init.states <- list()
+            for (name in init.states.name) {
+                init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+            }
+
+            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+            # update epoch counter
+            epoch.counter <- epoch.counter + 1
+            if (epoch.counter %% update.period == 0) {
+                # the gradient of initial c and initial h should be zero
+                init.grad <- list()
+                for (name in init.states.name) {
+                    init.grad[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+                }
+
+                mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE)
+
+                arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays)
+
+                mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE)
+
+                grad.arrays <- list()
+                for (name in names(m$rnn.exec$ref.grad.arrays)) {
+                    if (is.param.name(name))
+                        grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0
+                }
+                mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE)
+
+            }
+
+            train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size)
+
+            nbatch <- nbatch + seq.len
+            if ((epoch.counter %% log.period) == 0) {
+                cat(paste0("Epoch [", epoch.counter,
+                           "] Train: NLL=", train.nll / nbatch,
+                           ", Perp=", exp(train.nll / nbatch), "\n"))
+            }
+        }
+        train.data$reset()
+        # end of training loop
+        toc <- Sys.time()
+        cat(paste0("Iter [", iteration,
+                   "] Train: Time: ", as.numeric(toc - tic, units="secs"),
+                   " sec, NLL=", train.nll / nbatch,
+                   ", Perp=", exp(train.nll / nbatch), "\n"))
+
+        if (!is.null(eval.data)) {
+            val.nll <- 0.0
+            # validation set, reset states
+            init.states <- list()
+            for (name in init.states.name) {
+                init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+            }
+
+            mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+
+            eval.data$reset()
+            nbatch <- 0
+            while (eval.data$iter.next()) {
+                # set rnn input
+                rnn.input <- eval.data$value()
+                mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE)
+                mx.exec.forward(m$rnn.exec, is.train=FALSE)
+                # probability of each label class, used to evaluate nll
+                seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx))
+                # reset the states between batches
+                init.states <- list()
+                for (name in init.states.name) {
+                    init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0
+                }
+                mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE)
+                val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size)
+                nbatch <- nbatch + seq.len
+            }
+            eval.data$reset()
+            perp <- exp(val.nll / nbatch)
+            cat(paste0("Iter [", iteration,
+                       "] Val: NLL=", val.nll / nbatch,
+                       ", Perp=", exp(val.nll / nbatch), "\n"))
+        }
+    }
+
+    return (m)
+}
+
+# check data and translate data into iterator if data is array/matrix
+check.data <- function(data, batch.size, is.train) {
+    if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) {
+        stop("The dataset should be either a mx.io.DataIter or an R list")
+    }
+    if (is.list(data)) {
+        if (is.null(data$data) || is.null(data$label)){
+            stop("Please provide dataset as list(data=R.array, label=R.array)")
+        }
+        data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train)
+    }
+    if (!is.null(data) && !data$iter.next()) {
+        data$reset()
+        if (!data$iter.next()) stop("Empty input")
+    }
+    return (data)
+}
\ No newline at end of file
diff --git a/R-package/man/mx.gru.Rd b/R-package/man/mx.gru.Rd
new file mode 100644
index 000000000000..deca62cfa85a
--- /dev/null
+++ b/R-package/man/mx.gru.Rd
@@ -0,0 +1,66 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru}
+\alias{mx.gru}
+\title{Training GRU Unrolled Model}
+\usage{
+mx.gru(train.data, eval.data = NULL, num.gru.layer, seq.len, num.hidden,
+  num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(),
+  num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01),
+  dropout = 0, optimizer = "sgd", ...)
+}
+\arguments{
+\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array)
+The Training set.}
+
+\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional
+The validation set used for validation evaluation during the progress.}
+
+\item{num.gru.layer}{integer
+The number of the layer of gru.}
+
+\item{seq.len}{integer
+The length of the input sequence.}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer
+The batch size used for R array training.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{ctx}{mx.context, optional
+The device used to perform training.}
+
+\item{num.round}{integer, default=10
+The number of iterations over training data to train the model.}
+
+\item{update.period}{integer, default=1
+The number of iterations to update parameters during training period.}
+
+\item{initializer}{initializer object. default=mx.init.uniform(0.01)
+The initialization scheme for parameters.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{optimizer}{string, default="sgd"
+The optimization method.}
+
+\item{...}{other parameters passed to \code{mx.gru}.}
+}
+\value{
+model A trained gru unrolled model.
+}
+\description{
+Training GRU Unrolled Model
+}
+
diff --git a/R-package/man/mx.gru.forward.Rd b/R-package/man/mx.gru.forward.Rd
new file mode 100644
index 000000000000..cedc27bd85a4
--- /dev/null
+++ b/R-package/man/mx.gru.forward.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru.forward}
+\alias{mx.gru.forward}
+\title{Using forward function to predict in gru inference model}
+\usage{
+mx.gru.forward(model, input.data, new.seq = FALSE)
+}
+\arguments{
+\item{model}{gru model
+A gru inference model}
+
+\item{input.data}{array.matrix
+The input data for forward function}
+
+\item{new.seq}{boolean, default=FALSE
+Whether the input is the start of a new sequence}
+}
+\value{
+result A list(prob=prob, model=model) containing the result probability of each label and the model.
+}
+\description{
+Using forward function to predict in gru inference model
+}
+
diff --git a/R-package/man/mx.gru.inference.Rd b/R-package/man/mx.gru.inference.Rd
new file mode 100644
index 000000000000..85c66ed8a781
--- /dev/null
+++ b/R-package/man/mx.gru.inference.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/gru.R
+\name{mx.gru.inference}
+\alias{mx.gru.inference}
+\title{Create a GRU Inference Model}
+\usage{
+mx.gru.inference(num.gru.layer, input.size, num.hidden, num.embed, num.label,
+  batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0)
+}
+\arguments{
+\item{num.gru.layer}{integer
+The number of the layer of gru.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer, default=1
+The batch size used for R array training.}
+
+\item{arg.params}{list
+Model parameter, list of name to NDArray of net's weights.}
+
+\item{ctx}{mx.context, optional
+The device used to perform inference.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+}
+\value{
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
+ A gru inference model.
+}
+\description{
+Create a GRU Inference Model
+}
+
diff --git a/R-package/man/mx.lstm.inference.Rd b/R-package/man/mx.lstm.inference.Rd
index af572ee28590..19fe3b7fa368 100644
--- a/R-package/man/mx.lstm.inference.Rd
+++ b/R-package/man/mx.lstm.inference.Rd
@@ -23,7 +23,7 @@ The output dim of embedding.}
 \item{num.label}{integer
 The number of labels.}
 
-\item{batch.size}{integer
+\item{batch.size}{integer, default=1
 The batch size used for R array training.}
 
 \item{arg.params}{list
@@ -36,7 +36,8 @@ Model parameter, list of name to NDArray of net's weights.}
 A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
 }
 \value{
-model a lstm inference model.
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
+ A lstm inference model.
 }
 \description{
 Create a LSTM Inference Model
diff --git a/R-package/man/mx.opt.rmsprop.Rd b/R-package/man/mx.opt.rmsprop.Rd
new file mode 100644
index 000000000000..d51447200f2e
--- /dev/null
+++ b/R-package/man/mx.opt.rmsprop.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/optimizer.R
+\name{mx.opt.rmsprop}
+\alias{mx.opt.rmsprop}
+\title{Create an RMSProp optimizer with respective parameters.
+Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.}
+\usage{
+mx.opt.rmsprop(learning.rate = 0.002, gamma1 = 0.95, gamma2 = 0.9,
+  wd = 0, rescale.grad = 1, clip_gradient = NULL, lr_scheduler = NULL)
+}
+\arguments{
+\item{learning.rate}{float, default=0.002
+Step size.}
+
+\item{gamma1}{float, default=0.95
+decay factor of moving average for gradient, gradient^2.}
+
+\item{gamma2}{float, default=0.9
+"momentum" factor.}
+
+\item{wd}{float, default=0.0
+L2 regularization coefficient added to all the weights.}
+
+\item{rescale.grad}{float, default=1.0
+rescaling factor of gradient.}
+
+\item{clip_gradient}{float, optional
+clip gradient in range [-clip_gradient, clip_gradient].}
+
+\item{lr_scheduler}{function, optional
+The learning rate scheduler.}
+}
+\description{
+Create an RMSProp optimizer with respective parameters.
+Reference: Tieleman T, Hinton G. Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude[J]. COURSERA: Neural Networks for Machine Learning, 2012, 4(2).
+The code follows: http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013.
+}
+
diff --git a/R-package/man/mx.rnn.Rd b/R-package/man/mx.rnn.Rd
new file mode 100644
index 000000000000..c40915c98275
--- /dev/null
+++ b/R-package/man/mx.rnn.Rd
@@ -0,0 +1,69 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn}
+\alias{mx.rnn}
+\title{Training RNN Unrolled Model}
+\usage{
+mx.rnn(train.data, eval.data = NULL, num.rnn.layer, seq.len, num.hidden,
+  num.embed, num.label, batch.size, input.size, ctx = mx.ctx.default(),
+  num.round = 10, update.period = 1, initializer = mx.init.uniform(0.01),
+  dropout = 0, optimizer = "sgd", batch.norm = FALSE, ...)
+}
+\arguments{
+\item{train.data}{mx.io.DataIter or list(data=R.array, label=R.array)
+The Training set.}
+
+\item{eval.data}{mx.io.DataIter or list(data=R.array, label=R.array), optional
+The validation set used for validation evaluation during the progress.}
+
+\item{num.rnn.layer}{integer
+The number of the layer of rnn.}
+
+\item{seq.len}{integer
+The length of the input sequence.}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer
+The batch size used for R array training.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{ctx}{mx.context, optional
+The device used to perform training.}
+
+\item{num.round}{integer, default=10
+The number of iterations over training data to train the model.}
+
+\item{update.period}{integer, default=1
+The number of iterations to update parameters during training period.}
+
+\item{initializer}{initializer object. default=mx.init.uniform(0.01)
+The initialization scheme for parameters.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{optimizer}{string, default="sgd"
+The optimization method.}
+
+\item{batch.norm}{boolean, default=FALSE
+Whether to use batch normalization.}
+
+\item{...}{other parameters passed to \code{mx.rnn}.}
+}
+\value{
+model A trained rnn unrolled model.
+}
+\description{
+Training RNN Unrolled Model
+}
+
diff --git a/R-package/man/mx.rnn.forward.Rd b/R-package/man/mx.rnn.forward.Rd
new file mode 100644
index 000000000000..c8763b6c1ad7
--- /dev/null
+++ b/R-package/man/mx.rnn.forward.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn.forward}
+\alias{mx.rnn.forward}
+\title{Using forward function to predict in rnn inference model}
+\usage{
+mx.rnn.forward(model, input.data, new.seq = FALSE)
+}
+\arguments{
+\item{model}{rnn model
+A rnn inference model}
+
+\item{input.data}{array.matrix
+The input data for forward function}
+
+\item{new.seq}{boolean, default=FALSE
+Whether the input is the start of a new sequence}
+}
+\value{
+result A list(prob=prob, model=model) containing the result probability of each label and the model.
+}
+\description{
+Using forward function to predict in rnn inference model
+}
+
diff --git a/R-package/man/mx.rnn.inference.Rd b/R-package/man/mx.rnn.inference.Rd
new file mode 100644
index 000000000000..56e00e62f620
--- /dev/null
+++ b/R-package/man/mx.rnn.inference.Rd
@@ -0,0 +1,49 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rnn.R
+\name{mx.rnn.inference}
+\alias{mx.rnn.inference}
+\title{Create an RNN Inference Model}
+\usage{
+mx.rnn.inference(num.rnn.layer, input.size, num.hidden, num.embed, num.label,
+  batch.size = 1, arg.params, ctx = mx.cpu(), dropout = 0,
+  batch.norm = FALSE)
+}
+\arguments{
+\item{num.rnn.layer}{integer
+The number of the layer of rnn.}
+
+\item{input.size}{integer
+The input dim of one-hot encoding of embedding}
+
+\item{num.hidden}{integer
+The number of hidden nodes.}
+
+\item{num.embed}{integer
+The output dim of embedding.}
+
+\item{num.label}{integer
+The number of labels.}
+
+\item{batch.size}{integer, default=1
+The batch size used for R array training.}
+
+\item{arg.params}{list
+Model parameter, list of name to NDArray of net's weights.}
+
+\item{ctx}{mx.context, optional
+The device used to perform inference.}
+
+\item{dropout}{float, default=0
+A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.}
+
+\item{batch.norm}{boolean, default=FALSE
+Whether to use batch normalization.}
+}
+\value{
+model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer)
+ A rnn inference model.
+}
+\description{
+Create an RNN Inference Model
+}
+
diff --git a/R-package/vignettes/CallbackFunctionTutorial.Rmd b/R-package/vignettes/CallbackFunctionTutorial.Rmd
index 85cd78be90b0..97b6ce3161a0 100644
--- a/R-package/vignettes/CallbackFunctionTutorial.Rmd
+++ b/R-package/vignettes/CallbackFunctionTutorial.Rmd
@@ -6,7 +6,7 @@ which can be very useful in model training.
 
 This tutorial is written in Rmarkdown.
 
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/CallbackFunctionTutorial.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CallbackFunctionTutorial.html)
 
 - You can find the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CallbackFunctionTutorial.Rmd)
 
diff --git a/R-package/vignettes/CharRnnModel.Rmd b/R-package/vignettes/CharRnnModel.Rmd
index 9066d60f7513..2cb4b00ec1ac 100644
--- a/R-package/vignettes/CharRnnModel.Rmd
+++ b/R-package/vignettes/CharRnnModel.Rmd
@@ -3,12 +3,12 @@ Char RNN Example
 This example aims to show how to use lstm model to build a char level language model, and generate text
 from it. We use a tiny shakespeare text for demo purpose.
-Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.
+Data can be found [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare)
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CharRnnModel.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)
 
 Load Data
@@ -21,10 +21,10 @@ Set basic network parameters.
 ```{r}
 batch.size = 32
 seq.len = 32
-num.hidden = 256
-num.embed = 256
-num.lstm.layer = 2
-num.round = 3
+num.hidden = 16
+num.embed = 16
+num.lstm.layer = 1
+num.round = 1
 learning.rate= 0.1
 wd=0.00001
 clip_gradient=1
@@ -161,33 +161,7 @@ model <- mx.lstm(X.train, X.val,
                  clip_gradient=clip_gradient)
 ```
 
-Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result.
-```
-Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657
-...
-Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355
-Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858
-Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419
-Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329
-...
-Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351
-Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367
-Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988
-...
-Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974
-Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519
-Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576
-Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767
-...
-Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296
-Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454
-Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604
-...
-Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
-Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
-Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
-```
 
 Inference from model
 --------------------
 helper function for random sample.
@@ -225,15 +199,12 @@ choice <- function(weights) {
 ```
 
 we can use random output or fixed output by choosing the largest probability.
 ```{r}
-make.output <- function(prob, sample=FALSE, temperature=1.) {
+make.output <- function(prob, sample=FALSE) {
   if (!sample) {
     idx <- which.max(as.array(prob))
   }
   else {
-    scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
-    rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
-    rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
-    idx <- choice(rescale)
+    idx <- choice(prob)
   }
   return (idx)
 
@@ -252,7 +223,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  ctx=mx.cpu())
 ```
 generate a sequence of 75 chars using function `mx.lstm.forward`.
-```
+```{r}
 start <- 'a'
 seq.len <- 75
 random.sample <- TRUE
 
@@ -273,4 +244,10 @@ The result:
 ```
 ah not a drobl greens
 Settled asing lately sistering sounted
 to their hight
-```
\ No newline at end of file
+```
+
+Other RNN models
+----------------
+In `mxnet`, other RNN models such as a custom RNN and GRU are also provided.
+- For a **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train an rnn model (see the sketch after this list). Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to do inference from the rnn model and get the forward result from the inference model.
+- For a **GRU model**, you can replace `mx.lstm` with `mx.gru` to train a gru model. Also, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to do inference from the gru model and get the forward result from the inference model.
\ No newline at end of file
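A hedged sketch of the swap, reusing this vignette's variables; `X.train`, `X.val`, and the vocabulary size `vocab` are assumed to be prepared exactly as for `mx.lstm` above:

```r
# Train a plain RNN instead of an LSTM; extra optimizer arguments pass
# through ... down to mx.opt.create.
model <- mx.rnn(X.train, X.val,
                num.rnn.layer = 1, seq.len = seq.len,
                num.hidden = num.hidden, num.embed = num.embed,
                num.label = vocab, batch.size = batch.size,
                input.size = vocab, ctx = mx.cpu(),
                num.round = num.round, update.period = 1,
                learning.rate = learning.rate, wd = wd,
                clip_gradient = clip_gradient)

# One-step inference, carrying hidden state across calls:
infer.model <- mx.rnn.inference(num.rnn.layer = 1, input.size = vocab,
                                num.hidden = num.hidden, num.embed = num.embed,
                                num.label = vocab, arg.params = model$arg.params,
                                ctx = mx.cpu())
ret <- mx.rnn.forward(infer.model, c(1), new.seq = TRUE)
prob <- ret$prob
```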
diff --git a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
index 32fdafd38145..6b58946eaa31 100644
--- a/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
+++ b/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd
@@ -7,13 +7,13 @@ algorithm can do is to classify real world images.
 
 In this example we will show how to use a pretrained Inception-BatchNorm Network to predict the class of
 real world image. The network architecture is described in [1].
-The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)
+The pre-trained Inception-BatchNorm network can be downloaded from [this link](http://data.dmlc.ml/mxnet/data/Inception.zip)
 This model gives the recent state-of-art prediction accuracy on image net dataset.
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/classifyRealImageWithPretrainedModel.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd)
 
 Package Loading
@@ -69,7 +69,7 @@ preproc.image <- function(im, mean.image) {
   shape <- dim(im)
   short.edge <- min(shape[1:2])
   xx <- floor((shape[1] - short.edge) / 2)
-  yy <- floor((shape[2] - short.edge) / 2) 
+  yy <- floor((shape[2] - short.edge) / 2)
   croped <- crop.borders(im, xx, yy)
   # resize to 224 x 224, needed by input of the model.
   resized <- resize(croped, 224, 224)
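For context, a hedged sketch of how `preproc.image` is used in this vignette; the image-loading helper and the model's `mean.img` are assumed to be set up as in the surrounding sections, which are not part of this hunk:

```r
# Load a test image and normalize it to the 224x224 input the pretrained
# Inception-BatchNorm model expects (file path is illustrative).
im <- load.image("Parrots.jpg")
normed <- preproc.image(im, mean.img)
```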
diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
index efb0dba98109..66ac18ef3806 100644
--- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
+++ b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
@@ -8,7 +8,7 @@ We will show you how to do classification and regression tasks respectively. The
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/fiveMinutesNeuralNetwork.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/fiveMinutesNeuralNetwork.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd)
 
 ## Classification
diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd
index a81613b4a59e..6387b4ba1694 100644
--- a/R-package/vignettes/mnistCompetition.Rmd
+++ b/R-package/vignettes/mnistCompetition.Rmd
@@ -5,7 +5,7 @@ Handwritten Digits Classification Competition
 We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
 This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a
-hosted version of tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html).
+hosted version of the tutorial [here](http://mxnet.readthedocs.io/en/latest/packages/r/mnistCompetition.html).
 
 ## Data Loading
diff --git a/amalgamation/Makefile b/amalgamation/Makefile
index 1111305b5bba..c23210a1a8b5 100644
--- a/amalgamation/Makefile
+++ b/amalgamation/Makefile
@@ -1,14 +1,14 @@
 export MXNET_ROOT=`pwd`/..
 
 # Change this to path of openblas
-export OPENBLAS_ROOT=`pwd`/OpenBLAS
+export OPENBLAS_ROOT=${MXNET_ROOT}/../OpenBLAS
 
 # Whether use minimum build without blas and SSE, this will make the library super slow
 ifndef MIN
-	export MIN= 0
+	export MIN=0
 endif
 
 ifndef ANDROID
-	export ANDROID=0
+	export ANDROID=0
 endif
@@ -17,8 +17,8 @@
 CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall
 
 ifneq ($(MIN), 1)
-	CFLAGS+= -I${OPENBLAS_ROOT}/include
-	LDFLAGS+=-L${OPENBLAS_ROOT}/lib -lopenblas
+	CFLAGS += -I${OPENBLAS_ROOT} -I${OPENBLAS_ROOT}/include
+	LDFLAGS+= -L${OPENBLAS_ROOT} -L${OPENBLAS_ROOT}/lib -lopenblas
 endif
@@ -68,4 +68,4 @@ ${MXNET_ROOT}/lib/libmxnet_predict.so: mxnet_predict-all.o
 	ls -alh $@
 
 clean:
-	rm -f *.d *.o
+	rm -f *.d *.o *.so *.a mxnet_predict-all.cc
diff --git a/amalgamation/README.md b/amalgamation/README.md
index f96a11d7ed57..9d4e3fe9c8a3 100644
--- a/amalgamation/README.md
+++ b/amalgamation/README.md
@@ -24,7 +24,7 @@ This module is created by [Jack Deng](https://github.com/jdeng).
 
 Android
 ---------------
-Setup NDK and build your standalone toolchain. [Instructions](http://developer.android.com/ndk/guides/standalone_toolchain.html#itc) Use the Advanced Method!!! In particular set PATH, CC and CXX.
+Setup NDK and build your standalone toolchain. [Instructions](http://developer.android.com/ndk/guides/standalone_toolchain.html#itc) Use the Advanced Method!!! In particular set PATH, CC and CXX. The minimum API level required is 16.
 
 Example:
 ```
@@ -33,7 +33,7 @@ export CC=arm-linux-androideabi-gcc  # or export CC=arm-linux-androideabi-clang
 export CXX=arm-linux-androideabi-g++ # or export CXX=arm-linux-androideabi-clang++
 ```
 
-Build OpenBlas for Android: [Build OpenBlas](https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android)
+Build OpenBLAS for Android: [Build OpenBLAS](https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android) Please put OpenBLAS source code outside mxnet directory.
 Modify OPENBLAS_ROOT in Makefile
 
 Type ```make ANDROID=1```
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 9016db58535b..be854180ceb1 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -30,14 +30,15 @@ def get_sources(def_file):
     sources = []
     files = []
     visited = set()
+    mxnet_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
     for line in open(def_file):
         files = files + line.strip().split(' ')
 
     for f in files:
         f = f.strip()
-        if not f or f.endswith('.o') or f == '\\': continue
+        if not f or f.endswith('.o:') or f == '\\': continue
         fn = os.path.relpath(f)
-        if fn.find('/usr/') < 0 and fn not in visited:
+        if os.path.abspath(f).startswith(mxnet_path) and fn not in visited:
             sources.append(fn)
             visited.add(fn)
     return sources
diff --git a/dmlc-core b/dmlc-core
index 9fd3b48462a7..755f577a38cf 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit 9fd3b48462a7a651e12a197679f71e043dcb25a2
+Subproject commit 755f577a38cf3aa07f38a2667ffc583d22195e52
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
index 10311cb31bf8..1e5a956450dc 100644
--- a/docker/cpu/Dockerfile
+++ b/docker/cpu/Dockerfile
@@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y build-essential git libopenblas-dev lib
 RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
     cp make/config.mk . && \
     echo "USE_BLAS=openblas" >>config.mk && \
-    make -j8
+    make -j$(nproc)
 
 # python package
 RUN apt-get install -y python-numpy wget unzip
diff --git a/docker/cuda/Dockerfile b/docker/cuda/Dockerfile
index 8796b70aa1c9..fff84352bf16 100644
--- a/docker/cuda/Dockerfile
+++ b/docker/cuda/Dockerfile
@@ -9,7 +9,7 @@ RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
     echo "USE_CUDA_PATH=/usr/local/cuda" >>config.mk && \
     echo "USE_CUDNN=1" >>config.mk && \
     echo "USE_BLAS=openblas" >>config.mk && \
-    make -j8 ADD_LDFLAGS=-L/usr/local/cuda/lib64/stubs
+    make -j$(nproc) ADD_LDFLAGS=-L/usr/local/cuda/lib64/stubs
 ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 # python package
diff --git a/docs/_static/js/auto_module_index.js b/docs/_static/js/auto_module_index.js
index b918ecdc1635..e0238ed391f8 100644
--- a/docs/_static/js/auto_module_index.js
+++ b/docs/_static/js/auto_module_index.js
@@ -21,5 +21,4 @@ function auto_index(module) {
       html += "";
       li_node.append(html);
     });
-}
-
+}
\ No newline at end of file
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index e0898a8b0567..a0901e42783d 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -1,8 +1,12 @@
[hunk body lost in extraction: the rewritten landing-page banner markup; only the tagline "Flexible and Efficient Library for Deep Learning" survives]
diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 62ebf399e936..1f444c640dbf 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -56,7 +56,7 @@
     };
 
-    {% for name in ['jquery.js', 'underscore.js', 'doctools.js', 'searchtools.js'] %}
+    {% for name in ['jquery.js', 'underscore.js', 'doctools.js', 'searchtools.js', 'selectlang.js'] %}
     {% endfor %}
 
@@ -76,7 +76,7 @@
     {%- endif %}
[script tag lines lost in extraction]
 {%- endmacro %}
diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html
index 587665f7912c..127cb40b2045 100644
--- a/docs/_static/mxnet-theme/navbar.html
+++ b/docs/_static/mxnet-theme/navbar.html
@@ -1,3 +1,4 @@
[navbar markup hunk: HTML tags lost in extraction]
+ + +Previous Navbar Layout End --> + + diff --git a/docs/_static/mxnet.css b/docs/_static/mxnet.css index f4862a706b9b..83de570c5683 100644 --- a/docs/_static/mxnet.css +++ b/docs/_static/mxnet.css @@ -1,6 +1,300 @@ +/* basic style */ +a, abbr, acronym, address, applet, big, blockquote, body, caption, cite, code, dd, del, dfn, div, dl, dt, em, fieldset, form, h1, h2, h3, h4, h5, h6, html, iframe, img, ins, kbd, label, legend, li, object, ol, p, q, s, samp, small, span, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, ul, var { + margin: 0; + padding: 0; + border: 0; + outline: 0; + font-weight: inherit; + font-style: inherit; + font-family: inherit; + font-size: 100 %; + vertical-align: baseline +} + +body { + background: #fff; + color: #000; + font-family: Lato, Helvetica, "Helvetica Neue", Arial, sans-serif; + font-size: 16px; + font-weight: 400; + line-height: 1.6; + text-rendering: optimizeLegibility; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +ol, +ul { + list-style: none +} + +table { + border-collapse: separate; + border-spacing: 0 +} + +caption, +table, +td, +th { + vertical-align: middle +} + +caption, +td, +th { + text-align: left; + font-weight: 400 +} + +a:hover, +a:focus, +a:active { + text-decoration: none; +} + +a img { + border: none +} + + + +html { + box-sizing: border-box; +} + +*, +: after, +: before { + box-sizing: inherit +} + +button::-moz-focus-inner, +input[type=button]::-moz-focus-inner, +input[type=reset]::-moz-focus-inner, +input[type=submit]::-moz-focus-inner { + padding: 0; + margin: 0; + border: 0 +} + +button, +input, +select { + margin: 0; + padding: 0; + border: 0 +} + +@media screen { + body, + html { + height: 100 %; + } +} +/* basic end */ + +/*** code style ***/ +/*code block style*/ +.highlight { + border-radius: 4px; +} + +pre { + border: 0; + line-height: 1.6; + margin: 0 0 16px; + padding: 10px 16px; + word-break: break-all; + word-wrap: break-word; +} + +/*code inline style*/ +code.docutils, code.literal { + padding: 3px 5px; +} +/*** code style end ***/ + +body > .container { + padding-top: 80px +} + /* header section */ -.splash{ - padding:5em 0 1em 0; +/* navbar */ +.navbar { + background-color:#0079b2; + opacity: 0.9; + border: 0px; + height: 60px; + padding: 0 80px; + margin-bottom: 0px; +} + +#header-inner { + display: -webkit-box; + display: -webkit-flex; + display: -ms-flexbox; + display: box; + display: flex; + -webkit-box-orient: horizontal; + -moz-box-orient: horizontal; + -webkit-box-lines: single; + -moz-box-lines: single; + -webkit-flex-flow: row nowrap; + -ms-flex-flow: row nowrap; + flex-flow: row; + -webkit-box-align: center; + -ms-flex-align: center; + -webkit-align-items: center; + align-items: center +} + +@media screen and(max-width: 768 px) { + #header-inner { + -webkit-box-pack: center; + -ms-flex-pack: center; + -webkit-justify-content: center; + justify-content: center + } +} + +#logo-wrap { + -webkit-box-flex: 1; + box-flex: 1; + -webkit-flex: 0 50 px; + -ms-flex: 0 50 px; + flex: 0 50 px +} + +#logo { + width: 150px; + display: block; + float: left; + height: 60px; + padding: 10px 0 0 0; +} + +#logo > img { + display: block; + width: 110px; +} + +#main-nav { + display: none; + -webkit-box-flex: 1; + box-flex: 1; + -webkit-flex: 1 auto; + -ms-flex: 1 auto; + flex: 1 auto; +} + +@media screen and (min-width:769px) { + #main-nav { + display: block + } +} + +.main-nav-link { + color: #fff; + text-decoration: none; + line-height: 50px; + opacity: .7; + 
-webkit-transition: .2s; + transition: .2s; + font-family: Lato, "Helvetica Neue", Helvetica, Arial, sans-serif; + display: inline-block; + padding: 0 15px +} + +.main-nav-link:hover { + opacity: 1; + color: #1094e8; + text-decoration: none; +} + +#dropdown-menu-position-anchor { + position: relative; +} + +#package-dropdown-menu { + top: 36px; + border-radius: 4px; + padding: 0; +} + +#package-dropdown-menu > li > a { + color: #0079b2; + padding: 6px 16px; + +} + +#search-input-wrap { + display: none; + padding-left: 6px; + padding-bottom: 8px; + border-bottom: 1px solid #999 +} + +#search-input-icon, +#search-input-wrap.on { + display: inline-block +} + +#search-input-icon { + color: #fff; + padding-right: .5em; + opacity: .7 +} + +#search-input { + background: none; + font-size: inherit; + font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif; + color: #fff; + outline: none; + -webkit-appearance: none +} + +#lang-select-wrap { + display: none; + position: relative +} + +@media screen and (min-width:769px) { + #lang-select-wrap { + display: block + } +} + +#lang-select-label { + color: #fff; + opacity: .7; + font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif; + line-height: 50px +} + +#lang-select-label span { + padding-left: 8px +} + +#lang-select-label i { + opacity: .7 +} + +#lang-select { + opacity: 0; + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + -webkit-appearance: menulist-button; + font-size: inherit +} + +/* banner */ +#splash{ + padding:60px 0 0 0; background-color:#0079b2; /* background-image:url(../img/bg.jpg); */ background-size:cover; @@ -9,27 +303,59 @@ text-align:center } -.splash h1{ +#splash #banner { + text-align: center +} + +#splash #banner-title { + padding: 20px 0 10px 0; + font-size: 40px; + line-height: 1.15; + font-weight: 300; + font-family: Lato, Helvetica Neue, Helvetica, Arial, sans-serif; +} + +@media screen and (min-width:769px) { + #splash #banner-title { + padding-top: 100px; + } +} + +#splash h1{ font-size: 40px; margin-bottom: 20px; } -.splash .social{ - margin:2em 0 + +#splash #social{ + margin:2em 0 4em 0; } -.splash .get_start { +#splash #get_start { margin:2em 0 } -.splash .get_start_btn { - border: 2px solid #FFFFFF; - border-radius: 5px; - color: #FFFFFF; - display: inline-block; - font-size: 26px; - padding: 9px 20px; +#splash #get_start_btn { + border: 1.8px solid #FFFFFF; + border-radius: 2px; + color: #FFFFFF; + display: inline-block; + font-size: 22px; + font-family: Helvetica, Helvetica Neue, Arial, sans-serif; + padding: 8px 20px; + -webkit-transition: .2s; + transition: .2s; +} + +#splash #get_start_btn:hover { + background-color: #FFFFFF; + color: #0079b2; + opacity: 0.9; } + + + + .section-tout{ padding:3em 0 3em; border-bottom:1px solid rgba(0,0,0,.05); @@ -112,45 +438,11 @@ div.sphinxsidebar ul ul { margin-left: 15px } padding-right: 15px } -body>.container { - padding-top: 80px -} -body { - font-size: 16px; -} -pre { - font-size: 14px; -} -/* navbar */ -.navbar { - background-color:#0079b2; - border: 0px; - height: 65px; -} -.navbar-right li { - display:inline-block; - vertical-align:top; - padding: 22px 4px; -} - -.navbar-left li { - display:inline-block; - vertical-align:top; - padding: 17px 10px; - /* margin: 0 5px; */ -} +/*embed end */ -.navbar-left li a { - font-size: 22px; - color: #fff; -} - -.navbar-left > li > a:hover{ - color:#fff; -} .flag-icon { background-size: contain; background-position: 50%; @@ -179,18 +471,7 @@ pre { /* padding: 10px; */ /* } */ -.navbar-brand 
 >img {
-  width: 110px;
-}
 
-.dropdown-menu li {
-  padding: 0px 0px;
-  width: 120px;
-}
 
-.dropdown-menu li a {
-  color: #0079b2;
-  font-size: 20px;
-}
 
 .section h1 {
   padding-top: 90px;
diff --git a/docs/_static/selectlang.js b/docs/_static/selectlang.js
new file mode 100644
index 000000000000..25337abcb22b
--- /dev/null
+++ b/docs/_static/selectlang.js
@@ -0,0 +1,25 @@
+function changeLanguage(langSelect, langSelectLabel, rootpath){
+  langSelect.change(function() {
+    var lang = langSelect.val();
+    if(lang == 'zh'){
+      location.href = rootpath + 'zh/index.html';
+    } else {
+      location.href = rootpath + 'index.html';
+    }
+  });
+}
+
+$(document).ready(function () {
+  var langSelect = $("#lang-select");
+  var langSelectLabel = $("#lang-select-label > span");
+  currHref = location.href;
+
+  if(/\/zh\//.test(currHref)){
+    langSelect.val("zh");
+  } else {
+    langSelect.val("en");
+  }
+  langSelectLabel.text($("option:selected").text());
+
+  changeLanguage(langSelect, langSelectLabel, getRootPath());
+})
\ No newline at end of file
diff --git a/docs/how_to/build.md b/docs/how_to/build.md
index 458493c70672..4ef8ac5420b7 100644
--- a/docs/how_to/build.md
+++ b/docs/how_to/build.md
@@ -59,7 +59,7 @@ sudo apt-get install -y build-essential git libatlas-base-dev libopencv-dev
 
 Then build mxnet
 ```bash
 git clone --recursive https://github.com/dmlc/mxnet
-cd mxnet; make -j4
+cd mxnet; make -j$(nproc)
 ```
 
 ### Building on OSX
@@ -77,9 +77,21 @@
 Then build mxnet
 ```bash
 git clone --recursive https://github.com/dmlc/mxnet
-cd mxnet; cp make/osx.mk ./config.mk; make -j4
+cd mxnet; cp make/osx.mk ./config.mk; make -j$(sysctl -n hw.ncpu)
 ```
 
+Or use the cmake command and Xcode
+
+```bash
+mkdir build; cd build
+cmake -G Xcode -DCMAKE_BUILD_TYPE=Release -DCMAKE_CONFIGURATION_TYPES="Release" -DUSE_OPENMP="OFF" -DUSE_CUDNN="OFF" -DUSE_CUDA="OFF" -DBLAS=MKL ..
+```
+
+Then open `mxnet.xcodeproj` with Xcode and change two flags in `Build Settings` before building:
+(1) Link-Time Optimization = Yes
+(2) Optimization Level = Fastest [-O3]
+
+
 Troubleshooting:
 
 Some of the users might meet the link error `ld: library not found for -lgomp`, indicating that the
 GNU implementation of OpenMP is not in the library path of the operating system.
@@ -95,7 +107,7 @@
 ln -s path1 /usr/local/lib/libgomp.dylib
 ```
 
-then run `make -j4` again.
+then run `make -j$(sysctl -n hw.ncpu)` again.
 
 ### Building on Windows
@@ -231,6 +243,37 @@ Now you should have the R package as a tar.gz file and you can install it as a n
 ```
 R CMD INSTALL mxnet_0.5.tar.gz
 ```
 
+If you can't load `mxnet` after enabling CUDA during the installation, please add the following lines into `$RHOME/etc/ldpaths`. You can find your `$RHOME` by using `R.home()` inside R.
+
+```bash
+export CUDA_HOME=/usr/local/cuda
+export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
+```
+
+To install the package with GPU support on Windows without building it from scratch, note that you need a couple of programs installed already:
+- You'll need the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit). This depends on Visual Studio, and a free compatible version would be [Visual Studio Community 2013](https://www.visualstudio.com/en-us/news/vs2013-community-vs.aspx). For instructions and compatibility checks, read http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/ .
+
+- You will also need to register as a developer at nvidia and download CUDNN V3, https://developer.nvidia.com/cudnn .
+
+
+1. Download the mxnet package as a ZIP from the Github repository https://github.com/dmlc/mxnet and unpack it. You will be editing the `/mxnet/R-package` folder.
+
+2. Download the most recent GPU-enabled package from the [Releases tab](https://github.com/dmlc/mxnet/releases). Unzip this file so you have a folder `/nocudnn`. Note that this file and the folder you'll save it in will be used for future reference and not directly for installing the package. Only some files will be copied from it into the `R-package` folder.
+
+(Note: you now have 2 folders we're working with, possibly in different locations, that we'll reference with `R-package/` and `nocudnn/`.)
+
+3. Download CUDNN V3 from https://developer.nvidia.com/cudnn. Unpack the .zip file and you'll see 3 folders, `/bin`, `/include`, `/lib`. Copy and replace these 3 folders into `nocudnn/3rdparty/cudnn/`, or unpack the .zip file there directly.
+
+4. Create the folder `R-package/inst/libs/x64`. Only 64-bit operating systems are supported for now, so you need the x64 folder.
+
+5. Put the DLL files in `R-package/inst/libs/x64`.
+
+The first DLL you need is `nocudnn/lib/libmxnet.dll`. The others are the ones in all 4 subfolders of `nocudnn/3rdparty/`; for `cudnn` and `openblas` you'll need to look in the `/bin` folders. There should now be 11 DLL files in `R-package/inst/libs/x64`.
+
+6. Copy the folder `nocudnn/include/` to `R-package/inst/`. So now you should have a folder `R-package/inst/include/` with 3 subfolders.
+
+7. Run `R CMD INSTALL --no-multiarch R-package`. Make sure that R is added to your PATH in Environment Variables. Running the command `where R` in Command Prompt should return the location.
+
 Note on Library Build:
 
 We isolate the library build from the Rcpp end to maximize portability
diff --git a/docs/how_to/cloud.md b/docs/how_to/cloud.md
index 1f8bfe9907be..26cda6ab8cc0 100644
--- a/docs/how_to/cloud.md
+++ b/docs/how_to/cloud.md
@@ -29,7 +29,7 @@
 There are several ways to upload local data to S3. One simple way is using
 [s3cmd](http://s3tools.org/s3cmd). For example:
 ```bash
-wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip
+wget http://data.dmlc.ml/mxnet/data/mnist.zip
 unzip mnist.zip && s3cmd put t*-ubyte s3://dmlc/mnist/
 ```
 
@@ -72,7 +72,7 @@ echo "USE_CUDNN=1" >>config.mk
 echo "USE_BLAS=atlas" >> config.mk
 echo "USE_DIST_KVSTORE = 1" >>config.mk
 echo "USE_S3=1" >>config.mk
-make -j8
+make -j$(nproc)
 ```
 
 To test whether everything is installed properly, we train a convolutional neural network on MNIST using a GPU:
diff --git a/docs/how_to/env_var.md b/docs/how_to/env_var.md
index c63ba7a12a53..d15e11386bde 100644
--- a/docs/how_to/env_var.md
+++ b/docs/how_to/env_var.md
@@ -31,6 +31,9 @@ Usually you do not need to change these settings, but they are listed here for r
 * MXNET_KVSTORE_BIGARRAY_BOUND (default=1e6)
   - The minimum size of "big array".
   - When the array size is bigger than this threshold, MXNET_KVSTORE_REDUCTION_NTHREADS threads will be used for reduction.
+* MXNET_CUDNN_AUTOTUNE_DEFAULT (default=0)
+  - The default value of cudnn_tune for convolution layers.
+  - Auto-tuning is turned off by default. Set to 1 to turn it on by default for benchmarking.
 
 Settings for Minimum Memory Usage
 ---------------------------------
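Like the other variables above, `MXNET_CUDNN_AUTOTUNE_DEFAULT` is read from the environment when the library starts, so it has to be set before `mxnet` is imported. A minimal sketch of enabling it from Python (the variable name comes from the doc above; everything else is plain standard library):

```python
import os

# must be set before importing mxnet, since the engine reads it at startup
os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '1'

import mxnet as mx  # convolution layers now benchmark cudnn algorithms by default
```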
diff --git a/docs/how_to/multi_devices.md b/docs/how_to/multi_devices.md
index a6611ad971bb..95ad8f8ba120 100644
--- a/docs/how_to/multi_devices.md
+++ b/docs/how_to/multi_devices.md
@@ -140,7 +140,7 @@ start a job by using `ssh`, `mpi`, `sge`, or `yarn`.
 Assume we are at the directory `mxnet/example/image-classification`,
 and want to train mnist with lenet by using
-[train_mnist.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_mnist.py]).
+[train_mnist.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_mnist.py).
 
 On a single machine we can run by
 ```bash
diff --git a/docs/packages/python/index.md b/docs/packages/python/index.md
index a9f3a0f2bac5..aa22ebcd2dce 100644
--- a/docs/packages/python/index.md
+++ b/docs/packages/python/index.md
@@ -1,7 +1,7 @@
 MXNet Python Package
 ====================
 This page contains links to all the python-related documents of the python package.
-To install the package package, checkout [Build and Installation Instruction](../../how_to/build.md).
+To install the python package, check out the [Build and Installation Instruction](../../how_to/build.md).
 
 There are three types of documents you can find about mxnet.
 * [Tutorials](#tutorials) are self-contained materials that introduce certain use-cases of mxnet.
diff --git a/docs/packages/r/CallbackFunctionTutorial.md b/docs/packages/r/CallbackFunctionTutorial.md
index c27e009fea7c..c60833a75bcd 100644
--- a/docs/packages/r/CallbackFunctionTutorial.md
+++ b/docs/packages/r/CallbackFunctionTutorial.md
@@ -6,7 +6,7 @@ which can be very useful in model training.
 
 This tutorial is written in Rmarkdown.
 
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/CallbackFunctionTutorial.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/CallbackFunctionTutorial.html)
 
 - You can find the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CallbackFunctionTutorial.Rmd)
 
diff --git a/docs/packages/r/CharRnnModel.Rmd b/docs/packages/r/CharRnnModel.md
similarity index 65%
rename from docs/packages/r/CharRnnModel.Rmd
rename to docs/packages/r/CharRnnModel.md
index 9066d60f7513..201301c7981b 100644
--- a/docs/packages/r/CharRnnModel.Rmd
+++ b/docs/packages/r/CharRnnModel.md
@@ -3,35 +3,46 @@ Char RNN Example
 This example aims to show how to use an lstm model to build a char-level language model and generate text from it. We use a tiny Shakespeare text for demo purposes.
 
-Data can be found at https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare.
+Data can be found [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/package/r/CharRnnModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/packages/r/CharRnnModel.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/CharRnnModel.Rmd)
 
 Load Data
 ---------
 First of all, load in the data and preprocess it.
-```{r}
+
+```r
 require(mxnet)
 ```
+
+```
+## Loading required package: mxnet
+```
+
+```
+## Loading required package: methods
+```
 Set basic network parameters.
-```{r} + +```r batch.size = 32 seq.len = 32 -num.hidden = 256 -num.embed = 256 -num.lstm.layer = 2 -num.round = 3 +num.hidden = 16 +num.embed = 16 +num.lstm.layer = 1 +num.round = 1 learning.rate= 0.1 wd=0.00001 clip_gradient=1 update.period = 1 ``` download the data. -```{r} + +```r download.data <- function(data_dir) { dir.create(data_dir, showWarnings = FALSE) if (!file.exists(paste0(data_dir,'input.txt'))) { @@ -41,7 +52,8 @@ download.data <- function(data_dir) { } ``` Make dictionary from text. -```{r} + +```r make.dict <- function(text, max.vocab=10000) { text <- strsplit(text, '') dic <- list() @@ -59,7 +71,8 @@ make.dict <- function(text, max.vocab=10000) { } ``` Transfer text into data feature. -```{r} + +```r make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) { fi <- file(file.path, "r") text <- paste(readLines(fi), collapse="\n") @@ -92,7 +105,8 @@ make.data <- function(file.path, seq.len=32, max.vocab=10000, dic=NULL) { } ``` Move tail text. -```{r} + +```r drop.tail <- function(X, batch.size) { shape <- dim(X) nstep <- as.integer(shape[2] / batch.size) @@ -100,7 +114,8 @@ drop.tail <- function(X, batch.size) { } ``` get the label of X -```{r} + +```r get.label <- function(X) { label <- array(0, dim=dim(X)) d <- dim(X)[1] @@ -114,9 +129,17 @@ get.label <- function(X) { } ``` get training data and eval data -```{r} + +```r download.data("./data/") ret <- make.data("./data/input.txt", seq.len=seq.len) +``` + +``` +## Total unique char: 65 +``` + +```r X <- ret$data dic <- ret$dic lookup.table <- ret$lookup.table @@ -143,7 +166,8 @@ Training Model -------------- In `mxnet`, we have a function called `mx.lstm` so that users can build a general lstm model. -```{r} + +```r model <- mx.lstm(X.train, X.val, ctx=mx.cpu(), num.round=num.round, @@ -159,39 +183,49 @@ model <- mx.lstm(X.train, X.val, learning.rate=learning.rate, wd=wd, clip_gradient=clip_gradient) +``` ``` -Setting the parameters ctx=mx.gpu(0) and num.round=5 can get the following result. 
+## Epoch [31] Train: NLL=3.53787130224343, Perp=34.3936275728271 +## Epoch [62] Train: NLL=3.43087958036949, Perp=30.903813186055 +## Epoch [93] Train: NLL=3.39771238228587, Perp=29.8956319855751 +## Epoch [124] Train: NLL=3.37581711716687, Perp=29.2481732041015 +## Epoch [155] Train: NLL=3.34523331338447, Perp=28.3671933405139 +## Epoch [186] Train: NLL=3.30756356274787, Perp=27.31848454823 +## Epoch [217] Train: NLL=3.25642968403829, Perp=25.9566978956055 +## Epoch [248] Train: NLL=3.19825967486207, Perp=24.4898727477925 +## Epoch [279] Train: NLL=3.14013971549828, Perp=23.1070950525017 +## Epoch [310] Train: NLL=3.08747601837462, Perp=21.9216781782189 +## Epoch [341] Train: NLL=3.04015595674863, Perp=20.9085038031042 +## Epoch [372] Train: NLL=2.99839339255659, Perp=20.0532932584534 +## Epoch [403] Train: NLL=2.95940091012609, Perp=19.2864139984503 +## Epoch [434] Train: NLL=2.92603311380224, Perp=18.6534872738302 +## Epoch [465] Train: NLL=2.89482756896395, Perp=18.0803835531869 +## Epoch [496] Train: NLL=2.86668230478397, Perp=17.5786009078994 +## Epoch [527] Train: NLL=2.84089368534943, Perp=17.1310684830416 +## Epoch [558] Train: NLL=2.81725862932279, Perp=16.7309220880514 +## Epoch [589] Train: NLL=2.79518870141492, Perp=16.3657166956952 +## Epoch [620] Train: NLL=2.77445683225304, Perp=16.0299176962855 +## Epoch [651] Train: NLL=2.75490970113174, Perp=15.719621374694 +## Epoch [682] Train: NLL=2.73697900634351, Perp=15.4402696117257 +## Epoch [713] Train: NLL=2.72059739336781, Perp=15.1893935780915 +## Epoch [744] Train: NLL=2.70462837571585, Perp=14.948760335793 +## Epoch [775] Train: NLL=2.68909904683828, Perp=14.7184093476224 +## Epoch [806] Train: NLL=2.67460054451836, Perp=14.5065539595711 +## Epoch [837] Train: NLL=2.66078997776751, Perp=14.3075873113043 +## Epoch [868] Train: NLL=2.6476781639279, Perp=14.1212134100373 +## Epoch [899] Train: NLL=2.63529039846876, Perp=13.9473621677371 +## Epoch [930] Train: NLL=2.62367693518974, Perp=13.7863219168709 +## Epoch [961] Train: NLL=2.61238282674384, Perp=13.6314936713501 +## Iter [1] Train: Time: 10301.6818172932 sec, NLL=2.60536539345356, Perp=13.5361704272949 +## Iter [1] Val: NLL=2.26093848746227, Perp=9.59208699731232 ``` -Epoch [31] Train: NLL=3.47213018872144, Perp=32.2052727363657 -... -Epoch [961] Train: NLL=2.32060007657895, Perp=10.181782322355 -Iter [1] Train: Time: 186.397065639496 sec, NLL=2.31135356537961, Perp=10.0880702804858 -Iter [1] Val: NLL=1.94184484060012, Perp=6.97160060607419 -Epoch [992] Train: NLL=1.84784553299322, Perp=6.34613225095329 -... -Epoch [1953] Train: NLL=1.70175791172558, Perp=5.48357857093351 -Iter [2] Train: Time: 188.929051160812 sec, NLL=1.70103940328978, Perp=5.47963998859367 -Iter [2] Val: NLL=1.74979316010449, Perp=5.75341251767988 -... -Epoch [2914] Train: NLL=1.54738185300295, Perp=4.69915099483974 -Iter [3] Train: Time: 185.425321578979 sec, NLL=1.54604189517013, Perp=4.69285854740519 -Iter [3] Val: NLL=1.67780240235925, Perp=5.35377758479576 -Epoch [2945] Train: NLL=1.48868466087876, Perp=4.43126307034767 -... -Iter [4] Train: Time: 185.487086296082 sec, NLL=1.4744973925858, Perp=4.36883940994296 -Iter [4] Val: NLL=1.64488167325603, Perp=5.18039689118454 -Epoch [3937] Train: NLL=1.46355541021581, Perp=4.32129622881604 -... 
-Epoch [4898] Train: NLL=1.42900458455642, Perp=4.17454171976281
-Iter [5] Train: Time: 185.070136785507 sec, NLL=1.42909226256273, Perp=4.17490775130428
-Iter [5] Val: NLL=1.62716655804022, Perp=5.08943365437187
-```
 
 Inference from model
 --------------------
 Helper function for random sampling.
-```{r}
+
+```r
 cdf <- function(weights) {
   total <- sum(weights)
   result <- c()
@@ -224,16 +258,14 @@ choice <- function(weights) {
 }
 ```
 We can use random output or fixed output by choosing the largest probability.
-```{r}
-make.output <- function(prob, sample=FALSE, temperature=1.) {
+
+```r
+make.output <- function(prob, sample=FALSE) {
     if (!sample) {
         idx <- which.max(as.array(prob))
     }
     else {
-        scale_prob <- mx.nd.clip(prob, 1e-6, 1 - 1e-6)
-        rescale <- mx.nd.exp(mx.nd.log(scale_prob) / temperature)
-        rescale <- rescale / (as.array(mx.nd.sum(rescale))[1])
-        idx <- choice(rescale)
+        idx <- choice(prob)
     }
     return (idx)
@@ -242,7 +274,8 @@ make.output <- function(prob, sample=FALSE, temperature=1.) {
 
 In `mxnet`, we have a function called `mx.lstm.inference` so that users can build an inference from the lstm model and then use the function `mx.lstm.forward` to get forward output from the inference.
 
 Build inference from model.
-```{r}
+
+```r
 infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  input.size=vocab,
                                  num.hidden=num.hidden,
@@ -252,7 +285,7 @@ infer.model <- mx.lstm.inference(num.lstm.layer=num.lstm.layer,
                                  ctx=mx.cpu())
 ```
 Generate a sequence of 75 chars using the function `mx.lstm.forward`.
-```
+```r
 start <- 'a'
 seq.len <- 75
 random.sample <- TRUE
@@ -273,4 +306,10 @@ The result:
 ```
 ah not a drobl
 greens Settled asing lately sistering sounted
 to their hight
-```
\ No newline at end of file
+```
+
+Other RNN models
+----------------
+In `mxnet`, other RNN models such as the custom RNN and GRU are also provided.
+- For the **custom RNN model**, you can replace `mx.lstm` with `mx.rnn` to train an RNN model. Likewise, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.rnn.inference` and `mx.rnn.forward` to build an inference model and get forward results from it.
+- For the **GRU model**, you can replace `mx.lstm` with `mx.gru` to train a GRU model. Likewise, you can replace `mx.lstm.inference` and `mx.lstm.forward` with `mx.gru.inference` and `mx.gru.forward` to build an inference model and get forward results from it.
diff --git a/docs/packages/r/classifyRealImageWithPretrainedModel.md b/docs/packages/r/classifyRealImageWithPretrainedModel.md
index 6d73bb409bce..7bc5fec1a08f 100644
--- a/docs/packages/r/classifyRealImageWithPretrainedModel.md
+++ b/docs/packages/r/classifyRealImageWithPretrainedModel.md
@@ -6,13 +6,13 @@ algorithm can do is to classify real world images.
 
 In this example we will show how to use a pretrained Inception-BatchNorm network to predict the class of a real-world image. The network architecture is described in [1].
 
-The pre-trained Inception-BatchNorm network is able to be downloaded from [this link](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)
+The pre-trained Inception-BatchNorm network can be downloaded from [this link](http://data.dmlc.ml/mxnet/data/Inception.zip)
 This model gives the recent state-of-the-art prediction accuracy on the ImageNet dataset.
 
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/classifyRealImageWithPretrainedModel.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/classifyRealImageWithPretrainedModel.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/classifyRealImageWithPretrainedModel.Rmd)
 
 Package Loading
@@ -112,7 +112,7 @@ preproc.image <- function(im, mean.image) {
   shape <- dim(im)
   short.edge <- min(shape[1:2])
   xx <- floor((shape[1] - short.edge) / 2)
-  yy <- floor((shape[2] - short.edge) / 2)
+  yy <- floor((shape[2] - short.edge) / 2)
   croped <- crop.borders(im, xx, yy)
   # resize to 224 x 224, needed by input of the model.
   resized <- resize(croped, 224, 224)
diff --git a/docs/packages/r/fiveMinutesNeuralNetwork.md b/docs/packages/r/fiveMinutesNeuralNetwork.md
index 1d56c7984d7d..6102eaee7569 100644
--- a/docs/packages/r/fiveMinutesNeuralNetwork.md
+++ b/docs/packages/r/fiveMinutesNeuralNetwork.md
@@ -8,7 +8,7 @@ We will show you how to do classification and regression tasks respectively. The
 Preface
 -------
 This tutorial is written in Rmarkdown.
-- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.org/en/latest/R-package/fiveMinutesNeuralNetwork.html)
+- You can directly view the hosted version of the tutorial from [MXNet R Document](http://mxnet.readthedocs.io/en/latest/packages/r/fiveMinutesNeuralNetwork.html)
 - You can download the Rmarkdown source from [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd)
 
 ## Classification
diff --git a/docs/packages/r/index.md b/docs/packages/r/index.md
index ef427abc4899..829ca1d995e7 100644
--- a/docs/packages/r/index.md
+++ b/docs/packages/r/index.md
@@ -20,7 +20,7 @@ Tutorials
 * [Handwritten Digits Classification Competition](mnistCompetition.md)
 * [Tutorial on NDArray and Symbol](ndarrayAndSymbolTutorial.md)
 * [Tutorial on Callback Functions](CallbackFunctionTutorial.md)
-* [Character Language Model using RNN Model](CharRnnModel.Rmd)
+* [Character Language Model using RNN Model](CharRnnModel.md)
 
 Resources
 ---------
diff --git a/docs/packages/r/mnistCompetition.md b/docs/packages/r/mnistCompetition.md
index 4a0a0d71f854..a84ecb5ec326 100644
--- a/docs/packages/r/mnistCompetition.md
+++ b/docs/packages/r/mnistCompetition.md
@@ -5,7 +5,7 @@ Handwritten Digits Classification Competition
 We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
 This tutorial is written in Rmarkdown. You can download the source [here](https://github.com/dmlc/mxnet/blob/master/R-package/vignettes/mnistCompetition.Rmd) and view a
-hosted version of the tutorial [here](http://mxnet.readthedocs.org/en/latest/R-package/mnistCompetition.html).
+hosted version of the tutorial [here](http://mxnet.readthedocs.io/en/latest/packages/r/mnistCompetition.html).
 ## Data Loading
diff --git a/docs/tutorials/imagenet_full.md b/docs/tutorials/imagenet_full.md
index ba6b26648cdf..f0e722ed975d 100644
--- a/docs/tutorials/imagenet_full.md
+++ b/docs/tutorials/imagenet_full.md
@@ -68,7 +68,7 @@ We should note that this result is by no means optimal, as we did not carefully
 
 ## The Code and Model
 The code and step-by-step guide are publicly available at [https://github.com/dmlc/mxnet/tree/master/example/image-classification](https://github.com/dmlc/mxnet/tree/master/example/image-classification)
 
-We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception)
+We also release a pretrained model under [https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception](https://github.com/dmlc/mxnet-model-gallery/tree/master/imagenet-21k-inception.md)
 
 ## How to Use The Model
 We should point out that 21k classes is much more challenging than 1k. Directly using the raw prediction is not a reasonable approach.
diff --git a/docs/zh/packages/python/index.md b/docs/zh/packages/python/index.md
new file mode 100644
index 000000000000..0a539eb6bb36
--- /dev/null
+++ b/docs/zh/packages/python/index.md
@@ -0,0 +1,26 @@
+MXNet Python Package
+====================
+
+This page contains links to all the python-related documents of the python package.
+To install the python package, please check out the [Build and Installation Instruction](../../how_to/build.md).
+
+There are three types of documents about mxnet.
+
+* [Tutorials](#tutorials) introduce specific use-cases of mxnet.
+* [Code Examples](../../../example) contains example code.
+* [Python API Documents](#python-api-documents) document specific modules and also contain the reference documentation of all APIs.
+
+Tutorials
+---------
+* [Python Overview Tutorial](tutorial.md)
+* [Symbolic Configuration and Execution in Pictures](symbol_in_pictures.md)
+* [How to Create New Operations (Layers)](../../how_to/new_op.md)
+
+Python API Documents
+--------------------
+* [High Level Model Training Related API](model.md)
+* [The Module API](module.md)
+* [NDArray API](ndarray.md)
+* [Symbolic API](symbol.md)
+* [KVStore API](kvstore.md)
+* [Data Loading API](io.md)
diff --git a/docs/zh/packages/python/io.md b/docs/zh/packages/python/io.md
new file mode 100644
index 000000000000..08165d84ed0d
--- /dev/null
+++ b/docs/zh/packages/python/io.md
@@ -0,0 +1,185 @@
+MXNet Python Data Loading API
+=============================
+* [Introduction](#introduction) introduces the main features of the MXNet data loading module.
+* [Parameters For Data Iterator](#parameters-for-data-iterator) clarifies the different usages of the dataIter parameters.
+* [Create A Data Iterator](#create-a-data-iterator) shows how to create a data iterator in MXNet python.
+* [How To Get Data](#how-to-get-data) introduces the data sources and data preprocessing tools.
+* [IO API Reference](#io-api-reference) contains the IO API reference and explanations.
+
+Introduction
+------------
+This page introduces how data is fed into MXNet. MXNet uses iterators to feed data to the neural network. An iterator does some preprocessing and provides the data to the network batch by batch.
+
+
+* We provide basic iterators for MNIST images and RecordIO images.
+* To hide the IO cost, we provide a prefetch strategy that lets the learning process and the data fetching run in parallel. A dedicated thread is used to fetch the data.
+
+Parameters For Data Iterator
+----------------------------
+
+Generally, to create a data iterator, you need to provide five kinds of parameters:
+
+* **Dataset Param** gives basic information about the dataset, e.g. the file path and the shape of the input data.
+* **Batch Param** gives information needed to form a batch, e.g. the batch size.
+* **Augmentation Param** specifies how to augment the input data (e.g. crop, mirror).
+* **Backend Param** controls the behavior of the backend threads that hide the data loading cost.
+* **Auxiliary Param** provides optional switches that help with checking and debugging.
+
+Usually, **Dataset Param** and **Batch Param** *must* be given, otherwise the data batch cannot be created. Other parameters can be set as required by the algorithm and performance needs. Well-explained examples are provided in the latter part of this document.
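For instance, a minimal MNIST iterator can be created from just those two required groups; a sketch, assuming the standard MNIST dump has been unpacked under `data/`:

```python
import mxnet as mx

train_iter = mx.io.MNISTIter(
    # Dataset Param: where the images and labels live (assumed paths)
    image="data/train-images-idx3-ubyte",
    label="data/train-labels-idx1-ubyte",
    # Batch Param: how many examples per batch
    batch_size=100)

batch = train_iter.next()
print(batch.data[0].shape)  # one batch of images, e.g. (100, 1, 28, 28)
```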
+Create A Data Iterator
+----------------------
+The IO API provides a simple way to create data iterators in python. The following code shows how to create a Cifar data iterator.
+
+
+```python
+>>>dataiter = mx.io.ImageRecordIter(
+>>>        # Utility Parameter
+>>>        # Optional
+>>>        # Name of the data, should match the name of the data input of the network
+>>>        # data_name='data',
+>>>        # Utility Parameter
+>>>        # Optional
+>>>        # Name of the label, should match the name of the label parameter of the network.
+>>>        # Usually, if the loss layer is named 'foo', then the label input has the name
+>>>        # 'foo_label', unless overwritten
+>>>        # label_name='softmax_label',
+>>>        # Dataset Parameter
+>>>        # Required
+>>>        # indicating the data file, please check the data is already there
+>>>        path_imgrec="data/cifar/train.rec",
+>>>        # Dataset Parameter
+>>>        # Required
+>>>        # indicating the image size after preprocessing
+>>>        data_shape=(3,28,28),
+>>>        # Batch Parameter
+>>>        # Required
+>>>        # tells how many images in a batch
+>>>        batch_size=100,
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # when mean_img is given, each image will subtract the mean value at each pixel
+>>>        mean_img="data/cifar/cifar10_mean.bin",
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # randomly crop a patch of the data_shape from the original image
+>>>        rand_crop=True,
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # randomly mirror the image horizontally
+>>>        rand_mirror=True,
+>>>        # Augmentation Parameter
+>>>        # Optional
+>>>        # randomly shuffle the data
+>>>        shuffle=False,
+>>>        # Backend Parameter
+>>>        # Optional
+>>>        # Preprocessing thread number
+>>>        preprocess_threads=4,
+>>>        # Backend Parameter
+>>>        # Optional
+>>>        # Prefetch buffer size
+>>>        prefetch_buffer=1)
+```
+
+From the above code we can learn how to create a data iterator. First, you need to explicitly point out what kind of data (MNIST, ImageRecord, etc.) is to be fetched. Then, provide the optional parameters describing the data, such as batching, data augmentation, multi-threaded processing and prefetching. The MXNet framework will check the validity of the parameters, and report an error if a required parameter is missing.
+
+
+How To Get Data
+---------------
+
+
+We provide a [script](../../tests/python/common/get_data.py) to download the MNIST data and the Cifar10 ImageRecord data. If you want to create your own dataset, we recommend using RecordIO as the data format.
+
+## Create Dataset Using RecordIO
+
+RecordIO implements a data format that stores records sequentially. We recommend storing and packing image data together as records. The benefits are:
+
+
+* Images are stored in a compressed format such as JPEG, since records can vary in size. The compressed format greatly reduces the dataset size on disk.
+* Packing several records together enables continuous disk reads and avoids random disk reads.
+* RecordIO is easy to partition, which makes distributed setups simpler. A concrete example is given later.
+
+We provide the [im2rec tool](../../tools/im2rec.cc) so that users can generate RecordIO datasets themselves. Here is the workflow:
+
+### 0.Before you start
+Make sure you have downloaded the datasets you need. You don't need to resize the images yourself; `im2rec` can now do that automatically. You can check the usage information provided by `im2rec` for more details.
+
+### 1.Make the image list
+Once you have the data, you first need to generate an image list file, in the following format:
+```
+integer_image_index \t label_index \t path_to_image
+```
+Usually the program reads a list file containing all the image file names, shuffles them, and then splits the shuffled list into a training list file and a testing list file, stored in the format shown below.
+
+A simple example file:
+
+```bash
+895099  464     n04467665_17283.JPEG
+10025081        412     ILSVRC2010_val_00025082.JPEG
+74181   789     n01915811_2739.JPEG
+10035553        859     ILSVRC2010_val_00035554.JPEG
+10048727        929     ILSVRC2010_val_00048728.JPEG
+94028   924     n01980166_4956.JPEG
+1080682 650     n11807979_571.JPEG
+972457  633     n07723039_1627.JPEG
+7534    11      n01630670_4486.JPEG
+1191261 249     n12407079_5106.JPEG
+```
+
+### 2.Make the binary file
+
+Use the *im2rec* program to generate the binary file. im2rec takes the path of the _image list file_ you just generated, the _root_ path of the images and the _output file_ path as parameters. The process may take several hours, so be patient. :)
+
+
+A simple example:
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256
+```
+For more usage, run the ```./bin/im2rec``` command directly; detailed usage will be printed to the terminal.
+
+### Extension: Multiple Labels for a Single Image
+
+The `im2rec` tool and `mx.io.ImageRecordIter` support multiple labels for a single image. Suppose you need four labels for each image; you can follow the steps below to use the RecordIO tools.
+
+1. Generate the image list file in the following format:
+```
+integer_image_index \t label_1 \t label_2 \t label_3 \t label_4 \t path_to_image
+```
+
+2. When running `im2rec`, add 'label_width=4' as a command line argument, e.g.
+```bash
+./bin/im2rec image.lst image_root_dir output.bin resize=256 label_width=4
+```
+
+3. When initializing your iterator, set `label_width=4` and `path_imglist=<>` as parameters.
+
+```python
+dataiter = mx.io.ImageRecordIter(
+  path_imgrec="data/cifar/train.rec",
+  data_shape=(3,28,28),
+  path_imglist="data/cifar/image.lst",
+  label_width=4
+)
+```
+
+With this, you have a multi-label data iterator.
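To verify that all four labels come through, one can pull a single batch and inspect the label shape; a sketch reusing the assumed `train.rec`/`image.lst` paths from step 3:

```python
import mxnet as mx

dataiter = mx.io.ImageRecordIter(
    path_imgrec="data/cifar/train.rec",
    data_shape=(3, 28, 28),
    path_imglist="data/cifar/image.lst",
    label_width=4,
    batch_size=100)

batch = dataiter.next()
# one label NDArray per batch; with label_width=4 its shape should be (100, 4)
print(batch.label[0].shape)
```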
+
+```eval_rst
+.. raw:: html
+
+
+```
+
+
+IO API Reference
+----------------
+
+```eval_rst
+.. automodule:: mxnet.io
+    :members:
+
+.. raw:: html
+
+
+```
diff --git a/docs/zh/packages/python/kvstore.md b/docs/zh/packages/python/kvstore.md
new file mode 100644
index 000000000000..5d694036d548
--- /dev/null
+++ b/docs/zh/packages/python/kvstore.md
@@ -0,0 +1,133 @@
+KVStore API
+===========
+
+* [Basic Push and Pull](#basic-push-and-pull)
+* [Interface for list key-value pairs](#interface-for-list-key-value-pairs)
+* [Multiple machines]() TODO
+
+## Basic Push and Pull
+
+Basic operations over multiple devices (GPUs) on a single machine.
+
+### Initialization
+
+Let's first consider a simple example: initialize an (`int`, `NDArray`) pair, push it into the KVStore, and then pull it back out.
+
+```python
+>>> kv = mx.kv.create('local') # create a local kv store.
+>>> shape = (2,3)
+>>> kv.init(3, mx.nd.ones(shape)*2)
+>>> a = mx.nd.zeros(shape)
+>>> kv.pull(3, out = a)
+>>> print a.asnumpy()
+[[ 2.  2.  2.]
+ [ 2.  2.  2.]]
+```
+
+### Push, Aggregation, and Updater
+
+For any key-value pair that has been initialized, we can push a value with the same shape to the `key`, overwriting the original value.
+
+
+```python
+>>> kv.push(3, mx.nd.ones(shape)*8)
+>>> kv.pull(3, out = a) # pull out the value
+>>> print a.asnumpy()
+[[ 8.  8.  8.]
+ [ 8.  8.  8.]]
+```
+
+The data to be pushed can be stored on any device. Moreover, we can push multiple values to the same key; the KVStore client will first sum all these values, and then push the aggregated result to the server, reducing data communication.
+
+```python
+>>> gpus = [mx.gpu(i) for i in range(4)]
+>>> b = [mx.nd.ones(shape, gpu) for gpu in gpus]
+>>> kv.push(3, b)
+>>> kv.pull(3, out = a)
+>>> print a.asnumpy()
+[[ 4.  4.  4.]
+ [ 4.  4.  4.]]
+```
+
+For each push, the KVStore updates the stored value with the pushed data in the way defined by the `updater`. The default `updater` is `ASSIGN`; we can replace the default one as needed.
+
+```python
+>>> def update(key, input, stored):
+>>>     print "update on key: %d" % key
+>>>     stored += input * 2
+>>> kv._set_updater(update)
+>>> kv.pull(3, out=a)
+>>> print a.asnumpy()
+[[ 4.  4.  4.]
+ [ 4.  4.  4.]]
+>>> kv.push(3, mx.nd.ones(shape))
+update on key: 3
+>>> kv.pull(3, out=a)
+>>> print a.asnumpy()
+[[ 6.  6.  6.]
+ [ 6.  6.  6.]]
+```
+
+### Pull
+
+We have already seen how to pull a single key-value pair. Similar to push, we can also pull the value onto multiple devices with a single call.
+
+```python
+>>> b = [mx.nd.ones(shape, gpu) for gpu in gpus]
+>>> kv.pull(3, out = b)
+>>> print b[1].asnumpy()
+[[ 6.  6.  6.]
+ [ 6.  6.  6.]]
+```
+
+## Interface for list key-value pairs
+
+All the operations introduced so far involve a single key. KVStore also provides an interface for a list of key-value pairs.
+
+For a single device:
+
+```python
+>>> keys = [5, 7, 9]
+>>> kv.init(keys, [mx.nd.ones(shape)]*len(keys))
+>>> kv.push(keys, [mx.nd.ones(shape)]*len(keys))
+update on key: 5
+update on key: 7
+update on key: 9
+>>> b = [mx.nd.zeros(shape)]*len(keys)
+>>> kv.pull(keys, out = b)
+>>> print b[1].asnumpy()
+[[ 3.  3.  3.]
+ [ 3.  3.  3.]]
+```
+
+For multiple devices:
+
+```python
+>>> b = [[mx.nd.ones(shape, gpu) for gpu in gpus]] * len(keys)
+>>> kv.push(keys, b)
+update on key: 5
+update on key: 7
+update on key: 9
+>>> kv.pull(keys, out = b)
+>>> print b[1][1].asnumpy()
+[[ 11.  11.  11.]
+ [ 11.  11.  11.]]
+```
+
+```eval_rst
+.. raw:: html
+
+
+```
+
+
+## API Reference
+
+```eval_rst
+.. automodule:: mxnet.kvstore
+    :members:
+
+.. raw:: html
+
+
+```
diff --git a/docs/zh/packages/python/ndarray.md b/docs/zh/packages/python/ndarray.md
new file mode 100644
index 000000000000..3b2fad0a199c
--- /dev/null
+++ b/docs/zh/packages/python/ndarray.md
@@ -0,0 +1,161 @@
+NDArray API
+===========
+
+The NDArray package (`mxnet.ndarray`) contains tensor computation routines similar to `numpy.ndarray`. The syntax is very close, except for some additional calls for handling I/O and multiple devices.
+
+Create NDArray
+--------------
+
+Similar to `numpy`, you can create an `mxnet.ndarray` as follows:
+```python
+>>> import mxnet as mx
+>>> # all-zero array of dimension 100x50
+>>> a = mx.nd.zeros((100, 50))
+>>> # all-one array of dimension 256x32x128x1
+>>> b = mx.nd.ones((256, 32, 128, 1))
+>>> # initialize array with contents
+>>> c = mx.nd.array([[1, 2, 3], [4, 5, 6]])
+```
+
+NDArray operations
+-------------------
+
+We provide a few basic ndarray operations, such as arithmetic and slicing. More operations are under development!
+
+### Arithmetic operations
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 50))
+>>> a.shape
+(100L, 50L)
+>>> b = mx.nd.ones((100, 50))
+>>> # c and d will be calculated in parallel here!
+>>> c = a + b
+>>> d = a - b
+>>> # inplace operation, b's contents will be modified, but c and d won't be affected.
+>>> b += d
+```
+
+### Slicing operations
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 50))
+>>> a[0:10] = 1   # first 10 rows will become 1
+```
+
+Conversion from/to `numpy.ndarray`
+----------------------------------
+
+MXNet NDArray supports conversion between `mxnet.ndarray` and `numpy.ndarray` in a natural way:
+
+```python
+>>> import mxnet as mx
+>>> import numpy as np
+>>> a = np.array([1,2,3])
+>>> b = mx.nd.array(a)                  # convert from numpy array
+>>> b
+
+>>> b.asnumpy()                         # convert to numpy array
+array([ 1.,  2.,  3.], dtype=float32)
+```
+
+Save Load NDArray
+-----------------
+
+You can always use pickle to save and load NDArrays.
+We also provide functions to simplify saving and loading lists or dictionaries of NDArrays.
+
+```python
+>>> import mxnet as mx
+>>> a = mx.nd.zeros((100, 200))
+>>> b = mx.nd.zeros((100, 200))
+>>> # save list of NDArrays
+>>> mx.nd.save("/path/to/array/file", [a, b])
+>>> # save dictionary of NDArrays to AWS S3
+>>> mx.nd.save("s3://path/to/s3/array", {'A' : a, 'B' : b})
+>>> # save list of NDArrays to hdfs.
+>>> mx.nd.save("hdfs://path/to/hdfs/array", [a, b])
+>>> from_file = mx.nd.load("/path/to/array/file")
+>>> from_s3 = mx.nd.load("s3://path/to/s3/array")
+>>> from_hdfs = mx.nd.load("hdfs://path/to/hdfs/array")
+```
+
+The nice thing about using `save` and `load` is:
+- You can use the same interface in all of `mxnet`'s other language bindings.
+- S3 and HDFS are already supported.
+
+Multi-device Support
+--------------------
+Device information is stored in the `mxnet.Context` structure. When creating an ndarray in mxnet, we either create it on a specific device with a context argument (the default is the CPU context), or use a `with` statement as in the following example:
+
+```python
+>>> import mxnet as mx
+>>> cpu_a = mx.nd.zeros((100, 200))
+>>> cpu_a.context
+cpu(0)
+>>> with mx.Context(mx.gpu(0)):
+>>>     gpu_a = mx.nd.ones((100, 200))
+>>> gpu_a.context
+gpu(0)
+>>> ctx = mx.Context(mx.gpu(0))
+>>> gpu_b = mx.nd.zeros((100, 200), ctx)
+>>> gpu_b.context
+gpu(0)
+```
+
+Currently, we do *not* support operations involving multiple ndarrays on different contexts. To perform such operations, first copy the ndarrays to the same context with the `copyto` method, and then run the operation:
+
+```python
+>>> import mxnet as mx
+>>> x = mx.nd.zeros((100, 200))
+>>> with mx.Context(mx.gpu(0)):
+>>>     y = mx.nd.zeros((100, 200))
+>>> z = x + y
+mxnet.base.MXNetError: [13:29:12] src/ndarray/ndarray.cc:33: Check failed: lhs.ctx() == rhs.ctx() operands context mismatch
+>>> cpu_y = mx.nd.zeros((100, 200))
+>>> y.copyto(cpu_y)
+>>> z = x + cpu_y
+```
+
+```eval_rst
+.. raw:: html
+
+
+```
+
+NDArray API Reference
+---------------------
+
+```eval_rst
+.. automodule:: mxnet.ndarray
+    :members:
+
+.. raw:: html
+
+
+```
+
+NDArray Random API Reference
+----------------------------
+
+```eval_rst
+.. automodule:: mxnet.random
+    :members:
+
+.. raw:: html
+
+
+```
+
+
+Context API Reference
+---------------------
+
+```eval_rst
+.. automodule:: mxnet.context
+    :members:
+
+.. raw:: html
+
+
+```
diff --git a/docs/zh/system/engine.md b/docs/zh/system/dep_engine.md
similarity index 100%
rename from docs/zh/system/engine.md
rename to docs/zh/system/dep_engine.md
diff --git a/docs/zh/system/index.md b/docs/zh/system/index.md
index 2798b9531daa..31811ca34547 100644
--- a/docs/zh/system/index.md
+++ b/docs/zh/system/index.md
@@ -21,7 +21,7 @@
 Shown above are the main modules of mxnet and how they interact with each other. The modules are
 
-- [Runtime dependency engine](engine.md): schedules and executes operations according to their read/write dependencies.
+- [Runtime dependency engine](dep_engine.md): schedules and executes operations according to their read/write dependencies.
 - Storage Allocator: allocates and reuses memory efficiently, including host memory on the CPU and device memory on the GPU.
 - Resource Manager: manages global resources such as random number generators and temporary workspaces.
 - NDArray: dynamic, asynchronous n-dimensional arrays, which provide the imperative programming model for MXNet.
diff --git a/docs/zh/system/note_data_loading.md b/docs/zh/system/note_data_loading.md
index a48040df98d3..738a6e23eb44 100644
--- a/docs/zh/system/note_data_loading.md
+++ b/docs/zh/system/note_data_loading.md
@@ -113,7 +113,7 @@ InputSplit takes the following parameters:
 
 ### Hide IO Cost Using Threadediter
 
-One way to hide the IO cost is to prefetch data on a separate 现成 (a typo for 线程, "thread") while the main thread runs feed-forward and backward. To support more complicated training schemes, MXNet provides a more general IO processing pipeline, threadediter, built on dmlc-core.
+One way to hide the IO cost is to prefetch data on a separate thread (线程) while the main thread runs feed-forward and backward. To support more complicated training schemes, MXNet provides a more general IO processing pipeline, threadediter, built on dmlc-core.
 
 The key of threadediter is to use a separate thread as the data provider and the main thread as the data consumer, as illustrated below.
diff --git a/example/cnn_text_classification/text_cnn.py b/example/cnn_text_classification/text_cnn.py
index c944ec5c9270..4ce48a94ac77 100644
--- a/example/cnn_text_classification/text_cnn.py
+++ b/example/cnn_text_classification/text_cnn.py
@@ -13,20 +13,28 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)  # get a logger so that accuracies are printed
 
+logs = sys.stderr
+
 CNNModel = namedtuple("CNNModel", ['cnn_exec', 'symbol', 'data', 'label', 'param_blocks'])
 
-def make_text_cnn(sentence_size, num_embed, batch_size, num_label=2, filter_list=[3, 4, 5], num_filter=100, dropout=0.):
+def make_text_cnn(sentence_size, num_embed, batch_size, vocab_size,
+                  num_label=2, filter_list=[3, 4, 5], num_filter=100,
+                  dropout=0., with_embedding=True):
+
     input_x = mx.sym.Variable('data') # placeholder for input
     input_y = mx.sym.Variable('softmax_label') # placeholder for output
 
     # embedding layer
-    # embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')
-    # embed_layer = mx.sym.Reshape(data=embed_layer, target_shape=(1, 1, sentence_size, num_embed))
+    if not with_embedding:
+        embed_layer = mx.sym.Embedding(data=input_x, input_dim=vocab_size, output_dim=num_embed, name='vocab_embed')
+        conv_input = mx.sym.Reshape(data=embed_layer, target_shape=(batch_size, 1, sentence_size, num_embed))
+    else:
+        conv_input = input_x
 
     # create convolution + (max) pooling layer for each filter operation
     pooled_outputs = []
     for i, filter_size in enumerate(filter_list):
-        convi = mx.sym.Convolution(data=input_x, kernel=(filter_size, num_embed), num_filter=num_filter)
+        convi = mx.sym.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
         relui = mx.sym.Activation(data=convi, act_type='relu')
         pooli = mx.sym.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1,1))
         pooled_outputs.append(pooli)
@@ -54,12 +62,18 @@ def make_text_cnn(sentence_size, num_embed, batch_size, num_label=2, filter_list
     return sm
 
 
-def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, dropout=0.5, initializer=mx.initializer.Uniform(0.1)):
-    cnn = make_text_cnn(sentence_size, num_embed, batch_size=batch_size, dropout=dropout)
+def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, vocab_size,
+                    dropout=0.5, initializer=mx.initializer.Uniform(0.1), with_embedding=True):
+
+    cnn = make_text_cnn(sentence_size, num_embed, batch_size=batch_size,
+            vocab_size=vocab_size, dropout=dropout, with_embedding=with_embedding)
     arg_names = cnn.list_arguments()
 
     input_shapes = {}
-    input_shapes['data'] = (batch_size, 1, sentence_size, num_embed)
+    if with_embedding:
+        input_shapes['data'] = (batch_size, 1, sentence_size, num_embed)
+    else:
+        input_shapes['data'] = (batch_size, sentence_size)
 
     arg_shape, out_shape, aux_shape = cnn.infer_shape(**input_shapes)
     arg_arrays = [mx.nd.zeros(s, ctx) for s in arg_shape]
@@ -88,7 +102,8 @@ def setup_cnn_model(ctx, batch_size, sentence_size, num_embed, dropout=0.5, init
     return CNNModel(cnn_exec=cnn_exec, symbol=cnn, data=data, label=label, param_blocks=param_blocks)
 
-def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, batch_size, optimizer='rmsprop', max_grad_norm=5.0, learning_rate=0.001, epoch=200):
+def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, batch_size,
+              optimizer='rmsprop', max_grad_norm=5.0, learning_rate=0.0005, epoch=200):
     m = model
     # create optimizer
     opt = mx.optimizer.create(optimizer)
@@ -139,13 +154,25 @@ def train_cnn(model, X_train_batch, y_train_batch, X_dev_batch, y_dev_batch, bat
         # decay learning rate
         if iteration % 50 == 0 and iteration > 0:
             opt.lr *= 0.5
-            print >> sys.stderr, 'reset learning rate to %g' % opt.lr
+            print >> logs, 'reset learning rate to %g' % opt.lr
 
         # end of training loop
         toc = time.time()
-        print >> sys.stderr, 'Iter [%d] Train: Time: %.3f, Training Accuracy: %.3f' % (iteration, toc - tic, num_correct * 100 / float(num_total))
+        train_time = toc - tic
+        train_acc = num_correct * 100 / float(num_total)
+
+        # save a checkpoint (symbol + params) every 10 iterations
+        if (iteration + 1) % 10 == 0:
+            prefix = 'cnn'
+            m.symbol.save('checkpoint/%s-symbol.json' % prefix)
+            save_dict = {('arg:%s' % k) : v for k, v in m.cnn_exec.arg_dict.items()}
+            save_dict.update({('aux:%s' % k) : v for k, v in m.cnn_exec.aux_dict.items()})
+            param_name = 'checkpoint/%s-%04d.params' % (prefix, iteration)
+            mx.nd.save(param_name, save_dict)
+            print >> logs, 'Saved checkpoint to %s' % param_name
+
 
-        # eval on dev set
+        # evaluate on dev set
         num_correct = 0
         num_total = 0
         for begin in range(0, X_dev_batch.shape[0], batch_size):
@@ -161,7 +188,9 @@
         num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
         num_total += len(batchY)
 
-    print >> sys.stderr, 'Dev Accuracy thus far: %.3f' % ( num_correct * 100 / float(num_total) )
+    dev_acc = num_correct * 100 / float(num_total)
+    print >> logs, 'Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f \
+            --- Dev Accuracy thus far: %.3f' % (iteration, train_time, train_acc, dev_acc)
 
 
 def main():
@@ -170,7 +199,6 @@
     word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
     x, y = data_helpers.load_data_with_word2vec(word2vec)
 
-
     # randomly shuffle data
     np.random.seed(10)
     shuffle_indices = np.random.permutation(np.arange(len(y)))
@@ -194,9 +222,38 @@
     print 'embedding size', num_embed
 
     batch_size = 50
-    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, dropout=0.5)
+    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, vocab_size=None, dropout=0.5)
+    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
+
+def train_without_pretrained_embedding():
+    x, y, vocab, vocab_inv = data_helpers.load_data()
+    vocab_size = len(vocab)
+
+    # randomly shuffle data
+    np.random.seed(10)
+    shuffle_indices = np.random.permutation(np.arange(len(y)))
+    x_shuffled = x[shuffle_indices]
+    y_shuffled = y[shuffle_indices]
+
+    # split train/dev set
+    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
+    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
+    print 'Train/Dev split: %d/%d' % (len(y_train), len(y_dev))
+    print 'train shape:', x_train.shape
+    print 'dev shape:', x_dev.shape
+    print 'vocab_size', vocab_size
+
+    batch_size = 50
+    num_embed = 300
+    sentence_size = x_train.shape[1]
+
+    print 'batch size', batch_size
+    print 'sentence max words', sentence_size
+    print 'embedding size', num_embed
+
+    cnn_model = setup_cnn_model(mx.gpu(0), batch_size, sentence_size, num_embed, vocab_size, dropout=0.5, with_embedding=False)
     train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
 
 if __name__ == '__main__':
-    main()
+    train_without_pretrained_embedding()
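The checkpointing added to `train_cnn` above saves the symbol as JSON plus a parameter dict with `arg:`/`aux:` prefixes. A minimal sketch of restoring such a checkpoint, assuming the same `checkpoint/` layout (the iteration number `0009` is only an example):

```python
import mxnet as mx

# load the network definition and the saved parameter dict
symbol = mx.sym.load('checkpoint/cnn-symbol.json')
save_dict = mx.nd.load('checkpoint/cnn-0009.params')

# split the flat dict back into arg and aux parameters
arg_params, aux_params = {}, {}
for k, v in save_dict.items():
    tp, name = k.split(':', 1)
    if tp == 'arg':
        arg_params[name] = v
    elif tp == 'aux':
        aux_params[name] = v
```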
diff --git a/example/image-classification/symbol_inception-bn-28-small.py b/example/image-classification/symbol_inception-bn-28-small.py
index bc934c377b5a..b5a2afce2c1c 100644
--- a/example/image-classification/symbol_inception-bn-28-small.py
+++ b/example/image-classification/symbol_inception-bn-28-small.py
@@ -17,7 +17,7 @@ def DownsampleFactory(data, ch_3x3, mirror_attr):
     # conv 3x3
     conv = ConvFactory(data=data, kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1), mirror_attr=mirror_attr)
     # pool
-    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type='max', attr=mirror_attr)
+    pool = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', attr=mirror_attr)
     # concat
     concat = mx.symbol.Concat(*[conv, pool])
     return concat
diff --git a/example/image-classification/symbol_inception-bn-full.py b/example/image-classification/symbol_inception-bn-full.py
index 27f6bebd9815..de87cf8ebe42 100644
--- a/example/image-classification/symbol_inception-bn-full.py
+++ b/example/image-classification/symbol_inception-bn-full.py
@@ -37,7 +37,7 @@ def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
     cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
     cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
     # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type="max", name=('max_pool_%s_pool' % name))
     # concat
     concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
     return concat
diff --git a/example/image-classification/symbol_inception-bn.py b/example/image-classification/symbol_inception-bn.py
index 985ede4a4a19..c3a2fa8d08ae 100644
--- a/example/image-classification/symbol_inception-bn.py
+++ b/example/image-classification/symbol_inception-bn.py
@@ -45,7 +45,7 @@ def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name):
     cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name))
     cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name))
     # pool + proj
-    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type="max", name=('max_pool_%s_pool' % name))
+    pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type="max", name=('max_pool_%s_pool' % name))
     # concat
     concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name)
     return concat
diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py
index aa5e2e1b571c..dc3580cd3181 100644
--- a/example/image-classification/train_cifar10.py
+++ b/example/image-classification/train_cifar10.py
@@ -9,7 +9,7 @@
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=60000,
                     help='the number of training examples')
@@ -40,7 +40,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')) :
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
         os.system("unzip -u cifar10.zip")
         os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
         os.chdir("..")
diff --git a/example/image-classification/train_cifar10_mirroring.py b/example/image-classification/train_cifar10_mirroring.py
index 81124a2f1776..24ded036bd71 100644
--- a/example/image-classification/train_cifar10_mirroring.py
+++ b/example/image-classification/train_cifar10_mirroring.py
@@ -8,7 +8,7 @@
 # documentation could be expected when this feature is mature.
 #
 # When mirroring is turned on and set properly, we could expect smaller memory
-# consumption with slightly slower computation speed (due to extra forward
+# consumption with slightly slower computation speed (due to extra forward
 # steps). We are not including a sample running log here, as this test case
 # is only a functionality test. Using pycuda to query GPU memory is also
 # not a very good way of measuring the memory usage here.
@@ -24,7 +24,7 @@
                     help = 'the cnn to use')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=60000,
                     help='the number of training examples')
@@ -55,7 +55,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')) :
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip")
         os.system("unzip -u cifar10.zip")
         os.system("mv cifar/* .; rm -rf cifar; rm cifar10.zip")
         os.chdir("..")
diff --git a/example/image-classification/train_cifar10_resnet.py b/example/image-classification/train_cifar10_resnet.py
index a90acc4b0aed..b85ffd35c434 100644
--- a/example/image-classification/train_cifar10_resnet.py
+++ b/example/image-classification/train_cifar10_resnet.py
@@ -49,7 +49,7 @@
 parser = argparse.ArgumentParser(description='train an image classifier on cifar10')
 parser.add_argument('--data-dir', type=str, default='cifar10/',
                     help='the input data directory')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--num-examples', type=int, default=50000,
                     help='the number of training examples')
@@ -79,7 +79,7 @@ def _download(data_dir):
     os.chdir(data_dir)
     if (not os.path.exists('train.rec')) or \
        (not os.path.exists('test.rec')):
-        os.system('wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip')
+        os.system('wget http://data.dmlc.ml/mxnet/data/cifar10.zip')
         os.system('unzip -u cifar10.zip')
         os.system('mv cifar/* .; rm -rf cifar; rm cifar10.zip')
         os.chdir('..')
diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py
index e53c607bc1a7..394231bd3acc 100644
--- a/example/image-classification/train_imagenet.py
+++ b/example/image-classification/train_imagenet.py
@@ -30,7 +30,7 @@
                     help="load the model on an epoch using the model-prefix")
 parser.add_argument('--batch-size', type=int, default=32,
                     help='the batch size')
-parser.add_argument('--gpus', type=str, default='0',
+parser.add_argument('--gpus', type=str,
                     help='the gpus will be used, e.g "0,1,2,3"')
 parser.add_argument('--kv-store', type=str, default='local',
                     help='the kvstore type')
diff --git a/example/image-classification/train_mnist.R b/example/image-classification/train_mnist.R
index e4fde087b174..4d80512a8e92 100644
--- a/example/image-classification/train_mnist.R
+++ b/example/image-classification/train_mnist.R
@@ -4,11 +4,11 @@ require(mxnet)
 
 download_ <- function(data_dir) {
   dir.create(data_dir, showWarnings = FALSE)
   setwd(data_dir)
-  if ((!file.exists('train-images-idx3-ubyte')) ||
+  if ((!file.exists('train-images-idx3-ubyte')) ||
       (!file.exists('train-labels-idx1-ubyte')) ||
       (!file.exists('t10k-images-idx3-ubyte')) ||
       (!file.exists('t10k-labels-idx1-ubyte'))) {
-    download.file(url='http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip',
+    download.file(url='http://data.dmlc.ml/mxnet/data/mnist.zip',
                   destfile='mnist.zip', method='wget')
     unzip("mnist.zip")
     file.remove("mnist.zip")
@@ -83,7 +83,7 @@ get_iterator <- function(data_shape) {
 }
 
 parse_args <- function() {
-  parser <- ArgumentParser(description='train an image classifer on mnist')
+  parser <- ArgumentParser(description='train an image classifier on mnist')
   parser$add_argument('--network', type='character', default='mlp',
                       choices = c('mlp', 'lenet'),
                       help = 'the cnn to use')
diff --git a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py
index 5b6fa3c05b60..fd7c00cfac25 100644
--- a/example/image-classification/train_mnist.py
+++ b/example/image-classification/train_mnist.py
@@ -12,10 +12,25 @@ def _download(data_dir):
         (not os.path.exists('train-labels-idx1-ubyte')) or \
         (not os.path.exists('t10k-images-idx3-ubyte')) or \
         (not os.path.exists('t10k-labels-idx1-ubyte')):
-        os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip")
+        os.system("wget http://data.dmlc.ml/mxnet/data/mnist.zip")
         os.system("unzip -u mnist.zip; rm mnist.zip")
     os.chdir("..")
 
+def get_loc(data, attr={'lr_mult':'0.01'}):
+    """
+    the localisation network in lenet-stn; it improves accuracy by more than 1%
+    when num-epoch >= 15
+    """
+    loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2,2))
+    loc = mx.symbol.Activation(data = loc, act_type='relu')
+    loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type='max')
+    loc = mx.symbol.Convolution(data=loc, num_filter=60, kernel=(3, 3), stride=(1,1), pad=(1, 1))
+    loc = mx.symbol.Activation(data = loc, act_type='relu')
+    loc = mx.symbol.Pooling(data=loc, global_pool=True, kernel=(2, 2), pool_type='avg')
+    loc = mx.symbol.Flatten(data=loc)
+    loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc", attr=attr)
+    return loc
+
 def get_mlp():
     """
     multi-layer perceptron
@@ -29,13 +44,16 @@ def get_mlp():
     mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
     return mlp
 
-def get_lenet():
+def get_lenet(add_stn=False):
     """
     LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
     Haffner. "Gradient-based learning applied to document recognition."
     Proceedings of the IEEE (1998)
     """
     data = mx.symbol.Variable('data')
+    if(add_stn):
+        data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape = (28,28),
+                                         transform_type="affine", sampler_type="bilinear")
     # first conv
     conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20)
     tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh")
@@ -88,7 +106,7 @@ def get_iterator_impl(args, kv):
 
 def parse_args():
     parser = argparse.ArgumentParser(description='train an image classifier on mnist')
     parser.add_argument('--network', type=str, default='mlp',
-                        choices = ['mlp', 'lenet'],
+                        choices = ['mlp', 'lenet', 'lenet-stn'],
                         help = 'the cnn to use')
     parser.add_argument('--data-dir', type=str, default='mnist/',
                         help='the input data directory')
@@ -124,6 +142,9 @@ def parse_args():
     if args.network == 'mlp':
         data_shape = (784, )
         net = get_mlp()
+    elif args.network == 'lenet-stn':
+        data_shape = (1, 28, 28)
+        net = get_lenet(True)
     else:
         data_shape = (1, 28, 28)
         net = get_lenet()
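The `lenet-stn` network above feeds `SpatialTransformer` a `loc` branch that must produce 6 affine parameters per example. A small sketch that checks this wiring with shape inference (the batch size 32 and the single fully-connected localisation branch are assumptions for illustration):

```python
import mxnet as mx

data = mx.sym.Variable('data')
# a minimal localisation branch: any symbol emitting 6 values per example works
loc = mx.sym.FullyConnected(data=mx.sym.Flatten(data=data), num_hidden=6, name='stn_loc')
stn = mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=(28, 28),
                                transform_type='affine', sampler_type='bilinear')

_, out_shapes, _ = stn.infer_shape(data=(32, 1, 28, 28))
print(out_shapes)  # expect [(32, 1, 28, 28)]: the input is resampled onto target_shape
```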
But we may change it dramatically in the near future. + +* We will release multi-GPU version soon. diff --git a/example/neural-style/end_to_end/README.md b/example/neural-style/end_to_end/README.md new file mode 100644 index 000000000000..2f19bf51abe4 --- /dev/null +++ b/example/neural-style/end_to_end/README.md @@ -0,0 +1,20 @@ +# End to End Neural Art + +This is an implementation of blog: [http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html](http://dmlc.ml/mxnet/2016/06/20/end-to-end-neural-style.html) + + +We will release a Multi-GPU training code soon. + +## How to use + + +1. First use `download.sh` to download pre-trained model and sample inputs + +2. Then prepare training dataset according to the blog + +3. Modify [boost_train.py](boost_train.py) + +## Pretrained Model + +Weight [https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip](https://github.com/dmlc/web-data/raw/master/mxnet/art/model.zip) +Inference [boost_inference.py](boost_inference.py) diff --git a/example/neural-style/end_to_end/basic.py b/example/neural-style/end_to_end/basic.py new file mode 100644 index 000000000000..ed9d3f601554 --- /dev/null +++ b/example/neural-style/end_to_end/basic.py @@ -0,0 +1,162 @@ +import sys +sys.path.insert(0, "../../mxnet/python/") + +import mxnet as mx +import numpy as np +import model_vgg19 as vgg + +class PretrainedInit(mx.init.Initializer): + def __init__(self, prefix, params, verbose=False): + self.prefix_len = len(prefix) + 1 + self.verbose = verbose + self.arg_params = {k : v for k, v in params.items() if k.startswith("arg:")} + self.aux_params = {k : v for k, v in params.items() if k.startswith("aux:")} + self.arg_names = set([k[4:] for k in self.arg_params.keys()]) + self.aux_names = set([k[4:] for k in self.aux_params.keys()]) + + def __call__(self, name, arr): + key = name[self.prefix_len:] + if key in self.arg_names: + if self.verbose: + print("Init %s" % name) + self.arg_params["arg:" + key].copyto(arr) + elif key in self.aux_params: + if self.verbose: + print("Init %s" % name) + self.aux_params["aux:" + key].copyto(arr) + else: + print("Unknown params: %s, init with 0" % name) + arr[:] = 0. 
+ + +def style_gram_symbol(input_shape, style): + _, output_shapes, _ = style.infer_shape(**input_shape) + gram_list = [] + grad_scale = [] + for i in range(len(style.list_outputs())): + shape = output_shapes[i] + x = mx.sym.Reshape(style[i], shape=(int(shape[1]), int(np.prod(shape[2:])))) + # use fully connected to quickly do dot(x, x^T) + gram = mx.sym.FullyConnected(x, x, no_bias=True, num_hidden=shape[1]) + gram_list.append(gram) + grad_scale.append(np.prod(shape[1:]) * shape[1]) + return mx.sym.Group(gram_list), grad_scale + + +def get_loss(gram, content): + gram_loss = [] + for i in range(len(gram.list_outputs())): + gvar = mx.sym.Variable("target_gram_%d" % i) + gram_loss.append(mx.sym.sum(mx.sym.square(gvar - gram[i]))) + cvar = mx.sym.Variable("target_content") + content_loss = mx.sym.sum(mx.sym.square(cvar - content)) + return mx.sym.Group(gram_loss), content_loss + +def get_content_module(prefix, dshape, ctx, params): + sym = vgg.get_vgg_symbol(prefix, True) + init = PretrainedInit(prefix, params) + mod = mx.mod.Module(symbol=sym, + data_names=("%s_data" % prefix,), + label_names=None, + context=ctx) + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False) + mod.init_params(init) + return mod + +def get_style_module(prefix, dshape, ctx, params): + input_shape = {"%s_data" % prefix : dshape} + style, content = vgg.get_vgg_symbol(prefix) + gram, gscale = style_gram_symbol(input_shape, style) + init = PretrainedInit(prefix, params) + mod = mx.mod.Module(symbol=gram, + data_names=("%s_data" % prefix,), + label_names=None, + context=ctx) + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False) + mod.init_params(init) + return mod + + +def get_loss_module(prefix, dshape, ctx, params): + input_shape = {"%s_data" % prefix : dshape} + style, content = vgg.get_vgg_symbol(prefix) + gram, gscale = style_gram_symbol(input_shape, style) + style_loss, content_loss = get_loss(gram, content) + sym = mx.sym.Group([style_loss, content_loss]) + init = PretrainedInit(prefix, params) + gram_size = len(gram.list_outputs()) + mod = mx.mod.Module(symbol=sym, + data_names=("%s_data" % prefix,), + label_names=None, + context=ctx) + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], + for_training=True, inputs_need_grad=True) + mod.init_params(init) + return mod, gscale + + + +if __name__ == "__main__": + from data_processing import PreprocessContentImage, PreprocessStyleImage + from data_processing import PostprocessImage, SaveImage + vgg_params = mx.nd.load("./model/vgg19.params") + style_weight = 2 + content_weight = 10 + long_edge = 384 + content_np = PreprocessContentImage("./input/IMG_4343.jpg", long_edge) + style_np = PreprocessStyleImage("./input/starry_night.jpg", shape=content_np.shape) + dshape = content_np.shape + ctx = mx.gpu() + # style + style_mod = get_style_module("style", dshape, ctx, vgg_params) + style_mod.forward(mx.io.DataBatch([mx.nd.array(style_np)], [0]), is_train=False) + style_array = [arr.copyto(mx.cpu()) for arr in style_mod.get_outputs()] + del style_mod + # content + content_mod = get_content_module("content", dshape, ctx, vgg_params) + content_mod.forward(mx.io.DataBatch([mx.nd.array(content_np)], [0]), is_train=False) + content_array = content_mod.get_outputs()[0].copyto(mx.cpu()) + del content_mod + # loss + mod, gscale = get_loss_module("loss", dshape, ctx, vgg_params) + extra_args = {"target_gram_%d" % i : style_array[i] for i in range(len(style_array))} + extra_args["target_content"] = content_array + mod.set_params(extra_args, {}, 
True, True) + grad_array = [] + for i in range(len(style_array)): + grad_array.append(mx.nd.ones((1,), ctx) * (float(style_weight) / gscale[i])) + grad_array.append(mx.nd.ones((1,), ctx) * (float(content_weight))) + # train + img = mx.nd.zeros(content_np.shape, ctx=ctx) + img[:] = mx.rnd.uniform(-0.1, 0.1, img.shape) + lr = mx.lr_scheduler.FactorScheduler(step=80, factor=.9) + optimizer = mx.optimizer.SGD( + learning_rate = 0.001, + wd = 0.0005, + momentum=0.9, + lr_scheduler = lr) + optim_state = optimizer.create_state(0, img) + + old_img = img.copyto(ctx) + clip_norm = 1 * np.prod(img.shape) + + import logging + for e in range(800): + mod.forward(mx.io.DataBatch([img], [0]), is_train=True) + mod.backward(grad_array) + data_grad = mod.get_input_grads()[0] + gnorm = mx.nd.norm(data_grad).asscalar() + if gnorm > clip_norm: + print("Data Grad: ", gnorm / clip_norm) + data_grad[:] *= clip_norm / gnorm + + optimizer.update(0, img, data_grad, optim_state) + new_img = img + eps = (mx.nd.norm(old_img - new_img) / mx.nd.norm(new_img)).asscalar() + old_img = new_img.copyto(ctx) + logging.info('epoch %d, relative change %f', e, eps) + if (e+1) % 50 == 0: + SaveImage(new_img.asnumpy(), 'output/tmp_'+str(e+1)+'.jpg') + + SaveImage(new_img.asnumpy(), "./output/out.jpg") + diff --git a/example/neural-style/end_to_end/boost_inference.py b/example/neural-style/end_to_end/boost_inference.py new file mode 100644 index 000000000000..72427bedc7a6 --- /dev/null +++ b/example/neural-style/end_to_end/boost_inference.py @@ -0,0 +1,38 @@ +import sys +sys.path.insert(0, "../mxnet/python") + +import mxnet as mx +import numpy as np + +#import basic +import data_processing +import gen_v3 +import gen_v4 + +dshape = (1, 3, 480, 640) +clip_norm = 1.0 * np.prod(dshape) +model_prefix = "./model/" +ctx = mx.gpu(0) + + + +# generator +gens = [gen_v4.get_module("g0", dshape, ctx), + gen_v3.get_module("g1", dshape, ctx), + gen_v3.get_module("g2", dshape, ctx), + gen_v4.get_module("g3", dshape, ctx)] +for i in range(len(gens)): + gens[i].load_params("./model/%d/v3_0002-0026000.params" % i) + +content_np = data_processing.PreprocessContentImage("../IMG_4343.jpg", min(dshape[2:]), dshape) +data = [mx.nd.array(content_np)] +for i in range(len(gens)): + gens[i].forward(mx.io.DataBatch([data[-1]], [0]), is_train=False) + new_img = gens[i].get_outputs()[0] + data.append(new_img.copyto(mx.cpu())) + data_processing.SaveImage(new_img.asnumpy(), "out_%d.jpg" % i) + + +import os +os.system("rm -rf out.zip") +os.system("zip out.zip out_*") diff --git a/example/neural-style/end_to_end/boost_train.py b/example/neural-style/end_to_end/boost_train.py new file mode 100644 index 000000000000..9100cc1875a2 --- /dev/null +++ b/example/neural-style/end_to_end/boost_train.py @@ -0,0 +1,147 @@ +import sys +sys.path.insert(0, "../../mxnet/python") + +import mxnet as mx +import numpy as np + +import basic +import data_processing +import gen_v3 +import gen_v4 + +# params +vgg_params = mx.nd.load("./vgg19.params") +style_weight = 1.2 +content_weight = 10 +dshape = (1, 3, 384, 384) +clip_norm = 0.05 * np.prod(dshape) +model_prefix = "v3" +ctx = mx.gpu(0) + +# init style +style_np = data_processing.PreprocessStyleImage("../starry_night.jpg", shape=dshape) +style_mod = basic.get_style_module("style", dshape, ctx, vgg_params) +style_mod.forward(mx.io.DataBatch([mx.nd.array(style_np)], [0]), is_train=False) +style_array = [arr.copyto(mx.cpu()) for arr in style_mod.get_outputs()] +del style_mod + +# content +content_mod = basic.get_content_module("content", 
dshape, ctx, vgg_params)
+
+# loss
+loss, gscale = basic.get_loss_module("loss", dshape, ctx, vgg_params)
+extra_args = {"target_gram_%d" % i : style_array[i] for i in range(len(style_array))}
+loss.set_params(extra_args, {}, True, True)
+grad_array = []
+for i in range(len(style_array)):
+    grad_array.append(mx.nd.ones((1,), ctx) * (float(style_weight) / gscale[i]))
+grad_array.append(mx.nd.ones((1,), ctx) * (float(content_weight)))
+
+# generator
+gens = [gen_v4.get_module("g0", dshape, ctx),
+        gen_v3.get_module("g1", dshape, ctx),
+        gen_v3.get_module("g2", dshape, ctx),
+        gen_v4.get_module("g3", dshape, ctx)]
+for gen in gens:
+    gen.init_optimizer(
+        optimizer='sgd',
+        optimizer_params={
+            'learning_rate': 1e-4,
+            'momentum' : 0.9,
+            'wd': 5e-3,
+            'clip_gradient' : 5.0
+        })
+
+
+# tv-loss
+def get_tv_grad_executor(img, ctx, tv_weight):
+    """create TV gradient executor with input bound to img
+    """
+    if tv_weight <= 0.0:
+        return None
+    nchannel = img.shape[1]
+    simg = mx.sym.Variable("img")
+    skernel = mx.sym.Variable("kernel")
+    channels = mx.sym.SliceChannel(simg, num_outputs=nchannel)
+    out = mx.sym.Concat(*[
+        mx.sym.Convolution(data=channels[i], weight=skernel,
+                           num_filter=1,
+                           kernel=(3, 3), pad=(1,1),
+                           no_bias=True, stride=(1,1))
+        for i in range(nchannel)])
+    kernel = mx.nd.array(np.array([[0, -1, 0],
+                                   [-1, 4, -1],
+                                   [0, -1, 0]])
+                         .reshape((1, 1, 3, 3)),
+                         ctx) / 8.0
+    out = out * tv_weight
+    return out.bind(ctx, args={"img": img,
+                               "kernel": kernel})
+tv_weight = 1e-2
+
+start_epoch = 0
+end_epoch = 3
+
+
+# data
+import os
+import random
+import logging
+
+data_root = "../data/"
+file_list = os.listdir(data_root)
+num_image = len(file_list)
+logging.info("Dataset size: %d" % num_image)
+
+
+# train
+
+for i in range(start_epoch, end_epoch):
+    random.shuffle(file_list)
+    for idx in range(num_image):
+        loss_grad_array = []
+        data_array = []
+        path = data_root + file_list[idx]
+        content_np = data_processing.PreprocessContentImage(path, min(dshape[2:]), dshape)
+        data = mx.nd.array(content_np)
+        data_array.append(data)
+        # get content
+        content_mod.forward(mx.io.DataBatch([data], [0]), is_train=False)
+        content_array = content_mod.get_outputs()[0].copyto(mx.cpu())
+        # set target content
+        loss.set_params({"target_content" : content_array}, {}, True, True)
+        # gen_forward
+        for k in range(len(gens)):
+            gens[k].forward(mx.io.DataBatch([data_array[-1]], [0]), is_train=True)
+            data_array.append(gens[k].get_outputs()[0].copyto(mx.cpu()))
+            # loss forward
+            loss.forward(mx.io.DataBatch([data_array[-1]], [0]), is_train=True)
+            loss.backward(grad_array)
+            grad = loss.get_input_grads()[0]
+            loss_grad_array.append(grad.copyto(mx.cpu()))
+        grad = mx.nd.zeros(data.shape)
+        for k in range(len(gens) - 1, -1, -1):
+            tv_grad_executor = get_tv_grad_executor(gens[k].get_outputs()[0],
+                                                    ctx, tv_weight)
+            tv_grad_executor.forward()
+
+            grad[:] += loss_grad_array[k] + tv_grad_executor.outputs[0].copyto(mx.cpu())
+            gnorm = mx.nd.norm(grad).asscalar()
+            if gnorm > clip_norm:
+                grad[:] *= clip_norm / gnorm
+
+            gens[k].backward([grad])
+            gens[k].update()
+        if idx % 20 == 0:
+            logging.info("Epoch %d: Image %d" % (i, idx))
+            for k in range(len(gens)):
+                logging.info("Data Norm :%.5f" %\
+                             (mx.nd.norm(gens[k].get_input_grads()[0]).asscalar() / np.prod(dshape)))
+        if idx % 1000 == 0:
+            for k in range(len(gens)):
+                gens[k].save_params("./model/%d/%s_%04d-%07d.params" % (k, model_prefix, i, idx))
+
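The `get_tv_grad_executor` helper above convolves each channel with a scaled discrete Laplacian. Below is a minimal numpy/scipy sketch of why that stencil acts as the gradient of a quadratic smoothness penalty; `smoothness_grad` is a hypothetical name for illustration, not part of the example code:

```python
import numpy as np
from scipy.signal import convolve2d

# The same 3x3 stencil get_tv_grad_executor loads into its convolution,
# including the 1/8 scale factor.
KERNEL = np.array([[0, -1, 0],
                   [-1, 4, -1],
                   [0, -1, 0]], dtype=np.float64) / 8.0

def smoothness_grad(channel, tv_weight):
    """Gradient of a quadratic smoothness penalty on one image channel.

    For E = 0.5 * sum over 4-connected neighbour pairs of (x_p - x_q)^2,
    dE/dx_p = 4 * x_p - (sum of p's neighbours) at interior pixels, which
    is exactly the Laplacian stencil above (up to the 1/8 scaling).
    """
    return tv_weight * convolve2d(channel, KERNEL, mode="same")

grad = smoothness_grad(np.random.rand(8, 8), tv_weight=1e-2)
```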
diff --git a/example/neural-style/end_to_end/data_processing.py b/example/neural-style/end_to_end/data_processing.py
new file mode 100644
index 000000000000..5469fb008d7a
--- /dev/null
+++ b/example/neural-style/end_to_end/data_processing.py
@@ -0,0 +1,67 @@
+import numpy as np
+from skimage import io, transform
+from skimage.restoration import denoise_tv_chambolle
+import logging
+import random
+FORMAT = '%(asctime)-15s %(message)s'
+logging.basicConfig(level=logging.INFO, format=FORMAT)
+
+def PreprocessContentImage(path, short_edge, dshape=None):
+    img = io.imread(path)
+    #logging.info("load the content image, size = %s", img.shape[:2])
+    factor = float(short_edge) / min(img.shape[:2])
+    new_size = (int(img.shape[0] * factor), int(img.shape[1] * factor))
+    resized_img = transform.resize(img, new_size)
+    sample = np.asarray(resized_img) * 256
+    if dshape is not None:
+        # random crop
+        xx = int((sample.shape[0] - dshape[2]))
+        yy = int((sample.shape[1] - dshape[3]))
+        xstart = random.randint(0, xx)
+        ystart = random.randint(0, yy)
+        xend = xstart + dshape[2]
+        yend = ystart + dshape[3]
+        sample = sample[xstart:xend, ystart:yend, :]
+
+    # swap axes to make image from (224, 224, 3) to (3, 224, 224)
+    sample = np.swapaxes(sample, 0, 2)
+    sample = np.swapaxes(sample, 1, 2)
+    # sub mean
+    sample[0, :] -= 123.68
+    sample[1, :] -= 116.779
+    sample[2, :] -= 103.939
+    #logging.info("resize the content image to %s", sample.shape)
+    return np.resize(sample, (1, 3, sample.shape[1], sample.shape[2]))
+
+def PreprocessStyleImage(path, shape):
+    img = io.imread(path)
+    resized_img = transform.resize(img, (shape[2], shape[3]))
+    sample = np.asarray(resized_img) * 256
+    sample = np.swapaxes(sample, 0, 2)
+    sample = np.swapaxes(sample, 1, 2)
+
+    sample[0, :] -= 123.68
+    sample[1, :] -= 116.779
+    sample[2, :] -= 103.939
+    return np.resize(sample, (1, 3, sample.shape[1], sample.shape[2]))
+
+def PostprocessImage(img):
+    img = np.resize(img, (3, img.shape[2], img.shape[3]))
+    img[0, :] += 123.68
+    img[1, :] += 116.779
+    img[2, :] += 103.939
+    img = np.swapaxes(img, 1, 2)
+    img = np.swapaxes(img, 0, 2)
+    img = np.clip(img, 0, 255)
+    return img.astype('uint8')
+
+def SaveImage(img, filename, remove_noise=0.02):
+    logging.info('save output to %s', filename)
+    out = PostprocessImage(img)
+    if remove_noise != 0.0:
+        out = denoise_tv_chambolle(out, weight=remove_noise, multichannel=True)
+    io.imsave(filename, out)
diff --git a/example/neural-style/end_to_end/gen_v3.py b/example/neural-style/end_to_end/gen_v3.py
new file mode 100644
index 000000000000..dbc83b1ea004
--- /dev/null
+++ b/example/neural-style/end_to_end/gen_v3.py
@@ -0,0 +1,72 @@
+
+# coding: utf-8
+
+# In[1]:
+
+import sys
+sys.path.insert(0, "../../mxnet/python")
+
+
+# In[2]:
+
+import mxnet as mx
+import numpy as np
+
+
+def Conv(data, num_filter, kernel=(5, 5), pad=(2, 2), stride=(2, 2)):
+    sym = mx.sym.Convolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=False)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    return sym
+
+
+def Deconv(data, num_filter, im_hw, kernel=(7, 7), pad=(2, 2), stride=(2, 2), crop=True, out=False):
+    sym = mx.sym.Deconvolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True)
+    if crop:
+        sym = mx.sym.Crop(sym, offset=(1, 1), h_w=im_hw, num_args=1)
+    sym = mx.sym.BatchNorm(sym, fix_gamma=False)
+    if out == False:
+        sym = mx.sym.LeakyReLU(sym, act_type="leaky")
+    else:
+        sym = mx.sym.Activation(sym, act_type="tanh")
+    return sym
+
+# In[70]:
+
+def 
get_generator(prefix, im_hw): + data = mx.sym.Variable("%s_data" % prefix) + conv1 = Conv(data, 64) # 192 + conv1_1 = Conv(conv1, 48, kernel=(3, 3), pad=(1, 1), stride=(1, 1)) + conv2 = Conv(conv1_1, 128) # 96 + conv2_1 = Conv(conv2, 96, kernel=(3, 3), pad=(1, 1), stride=(1, 1)) + conv3 = Conv(conv2_1, 256) # 48 + conv3_1 = Conv(conv3, 192, kernel=(3, 3), pad=(1, 1), stride=(1, 1)) + deconv1 = Deconv(conv3_1, 128, (int(im_hw[0] / 4), int(im_hw[1] / 4))) + conv2 + conv4_1 = Conv(deconv1, 160, kernel=(3, 3), pad=(1, 1), stride=(1, 1)) + deconv2 = Deconv(conv4_1, 64, (int(im_hw[0] / 2), int(im_hw[1] / 2))) + conv1 + conv5_1 = Conv(deconv2, 96, kernel=(3, 3), pad=(1, 1), stride=(1, 1)) + deconv3 = Deconv(conv5_1, 3, im_hw, kernel=(8, 8), pad=(3, 3), out=True, crop=False) + raw_out = (deconv3 * 128) + 128 + norm = mx.sym.SliceChannel(raw_out, num_outputs=3) + r_ch = norm[0] - 123.68 + g_ch = norm[1] - 116.779 + b_ch = norm[2] - 103.939 + norm_out = 0.4 * mx.sym.Concat(*[r_ch, g_ch, b_ch]) + 0.6 * data + return norm_out + +def get_module(prefix, dshape, ctx, is_train=True): + sym = get_generator(prefix, dshape[-2:]) + mod = mx.mod.Module(symbol=sym, + data_names=("%s_data" % prefix,), + label_names=None, + context=ctx) + if is_train: + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=True, inputs_need_grad=True) + else: + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False, inputs_need_grad=False) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + return mod + + + + diff --git a/example/neural-style/end_to_end/gen_v4.py b/example/neural-style/end_to_end/gen_v4.py new file mode 100644 index 000000000000..379e904b9690 --- /dev/null +++ b/example/neural-style/end_to_end/gen_v4.py @@ -0,0 +1,86 @@ + +# coding: utf-8 + +# In[1]: + +import sys +sys.path.insert(0, "../mxnet/python") + + +# In[2]: + +import mxnet as mx +import numpy as np + + +def Conv(data, num_filter, kernel=(5, 5), pad=(2, 2), stride=(2, 2)): + sym = mx.sym.Convolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=False) + sym = mx.sym.BatchNorm(sym, fix_gamma=False) + sym = mx.sym.LeakyReLU(sym, act_type="leaky") + return sym + + +def Deconv(data, num_filter, kernel=(6, 6), pad=(2, 2), stride=(2, 2), out=False): + sym = mx.sym.Deconvolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True) + sym = mx.sym.BatchNorm(sym, fix_gamma=False) + if out == False: + sym = mx.sym.LeakyReLU(sym, act_type="leaky") + else: + sym = mx.sym.Activation(sym, act_type="tanh") + return sym + +# In[70]: + +def get_generator(prefix, im_hw): + data = mx.sym.Variable("%s_data" % prefix) + + conv1_1 = mx.sym.Convolution(data, num_filter=48, kernel=(5, 5), pad=(2, 2), no_bias=False) + conv1_1 = mx.sym.BatchNorm(conv1_1, fix_gamma=False) + conv1_1 = mx.sym.LeakyReLU(conv1_1, act_type="leaky") + + conv2_1 = mx.sym.Convolution(conv1_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=False) + conv2_1 = mx.sym.BatchNorm(conv2_1, fix_gamma=False) + conv2_1 = mx.sym.LeakyReLU(conv2_1, act_type="leaky") + + conv3_1 = mx.sym.Convolution(conv2_1, num_filter=64, kernel=(3, 3), pad=(1, 1), no_bias=False) + conv3_1 = mx.sym.BatchNorm(conv3_1, fix_gamma=False) + conv3_1 = mx.sym.LeakyReLU(conv3_1, act_type="leaky") + + conv4_1 = mx.sym.Convolution(conv3_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=False) + conv4_1 = mx.sym.BatchNorm(conv4_1, fix_gamma=False) + conv4_1 = mx.sym.LeakyReLU(conv4_1, act_type="leaky") + + conv5_1 = mx.sym.Convolution(conv4_1, 
num_filter=48, kernel=(5, 5), pad=(2, 2), no_bias=False) + conv5_1 = mx.sym.BatchNorm(conv5_1, fix_gamma=False) + conv5_1 = mx.sym.LeakyReLU(conv5_1, act_type="leaky") + + conv6_1 = mx.sym.Convolution(conv5_1, num_filter=32, kernel=(5, 5), pad=(2, 2), no_bias=True) + conv6_1 = mx.sym.BatchNorm(conv6_1, fix_gamma=False) + conv6_1 = mx.sym.LeakyReLU(conv6_1, act_type="leaky") + + out = mx.sym.Convolution(conv6_1, num_filter=3, kernel=(3, 3), pad=(1, 1), no_bias=True) + out = mx.sym.BatchNorm(out, fix_gamma=False) + out = mx.sym.Activation(data=out, act_type="tanh") + raw_out = (out * 128) + 128 + norm = mx.sym.SliceChannel(raw_out, num_outputs=3) + r_ch = norm[0] - 123.68 + g_ch = norm[1] - 116.779 + b_ch = norm[2] - 103.939 + norm_out = 0.4 * mx.sym.Concat(*[r_ch, g_ch, b_ch]) + 0.6 * data + return norm_out + +def get_module(prefix, dshape, ctx, is_train=True): + sym = get_generator(prefix, dshape[-2:]) + mod = mx.mod.Module(symbol=sym, + data_names=("%s_data" % prefix,), + label_names=None, + context=ctx) + if is_train: + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=True, inputs_need_grad=True) + else: + mod.bind(data_shapes=[("%s_data" % prefix, dshape)], for_training=False, inputs_need_grad=False) + mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) + return mod + + + diff --git a/example/neural-style/end_to_end/model_vgg19.py b/example/neural-style/end_to_end/model_vgg19.py new file mode 100644 index 000000000000..6e287b55b2fa --- /dev/null +++ b/example/neural-style/end_to_end/model_vgg19.py @@ -0,0 +1,96 @@ +import mxnet as mx +import os, sys +from collections import namedtuple + +ConvExecutor = namedtuple('ConvExecutor', ['executor', 'data', 'data_grad', 'style', 'content', 'arg_dict']) + +def get_vgg_symbol(prefix, content_only=False): + # declare symbol + data = mx.sym.Variable("%s_data" % prefix) + conv1_1 = mx.symbol.Convolution(name='%s_conv1_1' % prefix, data=data , num_filter=64, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu1_1 = mx.symbol.Activation(data=conv1_1 , act_type='relu') + conv1_2 = mx.symbol.Convolution(name='%s_conv1_2' % prefix, data=relu1_1 , num_filter=64, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu1_2 = mx.symbol.Activation(data=conv1_2 , act_type='relu') + pool1 = mx.symbol.Pooling(data=relu1_2 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg') + conv2_1 = mx.symbol.Convolution(name='%s_conv2_1' % prefix, data=pool1 , num_filter=128, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu2_1 = mx.symbol.Activation(data=conv2_1 , act_type='relu') + conv2_2 = mx.symbol.Convolution(name='%s_conv2_2' % prefix, data=relu2_1 , num_filter=128, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu2_2 = mx.symbol.Activation(data=conv2_2 , act_type='relu') + pool2 = mx.symbol.Pooling(data=relu2_2 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg') + conv3_1 = mx.symbol.Convolution(name='%s_conv3_1' % prefix, data=pool2 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu3_1 = mx.symbol.Activation(data=conv3_1 , act_type='relu') + conv3_2 = mx.symbol.Convolution(name='%s_conv3_2' % prefix, data=relu3_1 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu3_2 = mx.symbol.Activation(data=conv3_2 , act_type='relu') + conv3_3 = mx.symbol.Convolution(name='%s_conv3_3' % prefix, data=relu3_2 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024) + relu3_3 = mx.symbol.Activation(data=conv3_3 , act_type='relu') + conv3_4 
= mx.symbol.Convolution(name='%s_conv3_4' % prefix, data=relu3_3 , num_filter=256, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu3_4 = mx.symbol.Activation(data=conv3_4 , act_type='relu')
+    pool3 = mx.symbol.Pooling(data=relu3_4 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv4_1 = mx.symbol.Convolution(name='%s_conv4_1' % prefix, data=pool3 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_1 = mx.symbol.Activation(data=conv4_1 , act_type='relu')
+    conv4_2 = mx.symbol.Convolution(name='%s_conv4_2' % prefix, data=relu4_1 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_2 = mx.symbol.Activation(data=conv4_2 , act_type='relu')
+    conv4_3 = mx.symbol.Convolution(name='%s_conv4_3' % prefix, data=relu4_2 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_3 = mx.symbol.Activation(data=conv4_3 , act_type='relu')
+    conv4_4 = mx.symbol.Convolution(name='%s_conv4_4' % prefix, data=relu4_3 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu4_4 = mx.symbol.Activation(data=conv4_4 , act_type='relu')
+    pool4 = mx.symbol.Pooling(data=relu4_4 , pad=(0,0), kernel=(2,2), stride=(2,2), pool_type='avg')
+    conv5_1 = mx.symbol.Convolution(name='%s_conv5_1' % prefix, data=pool4 , num_filter=512, pad=(1,1), kernel=(3,3), stride=(1,1), workspace=1024)
+    relu5_1 = mx.symbol.Activation(data=conv5_1 , act_type='relu')
+
+
+    if content_only:
+        return relu4_2
+    # style and content layers
+    style = mx.sym.Group([relu1_1, relu2_1, relu3_1, relu4_1, relu5_1])
+    content = mx.sym.Group([relu4_2])
+    return style, content
+
+
+def get_executor_with_style(style, content, input_size, ctx):
+    out = mx.sym.Group([style, content])
+    # make executor
+    arg_shapes, output_shapes, aux_shapes = out.infer_shape(data=(1, 3, input_size[0], input_size[1]))
+    arg_names = out.list_arguments()
+    arg_dict = dict(zip(arg_names, [mx.nd.zeros(shape, ctx=ctx) for shape in arg_shapes]))
+    grad_dict = {"data": arg_dict["data"].copyto(ctx)}
+    # init with pretrained weight
+    pretrained = mx.nd.load("./model/vgg19.params")
+    for name in arg_names:
+        if name == "data":
+            continue
+        key = "arg:" + name
+        if key in pretrained:
+            pretrained[key].copyto(arg_dict[name])
+        else:
+            print("Skip argument %s" % name)
+    executor = out.bind(ctx=ctx, args=arg_dict, args_grad=grad_dict, grad_req="write")
+    return ConvExecutor(executor=executor,
+                        data=arg_dict["data"],
+                        data_grad=grad_dict["data"],
+                        style=executor.outputs[:-1],
+                        content=executor.outputs[-1],
+                        arg_dict=arg_dict)
+
+def get_executor_content(content, input_size, ctx):
+    arg_shapes, output_shapes, aux_shapes = content.infer_shape(data=(1, 3, input_size[0], input_size[1]))
+    arg_names = content.list_arguments()
+    arg_dict = dict(zip(arg_names, [mx.nd.zeros(shape, ctx=ctx) for shape in arg_shapes]))
+    pretrained = mx.nd.load("./model/vgg19.params")
+    for name in arg_names:
+        if name == "data":
+            continue
+        key = "arg:" + name
+        if key in pretrained:
+            pretrained[key].copyto(arg_dict[name])
+        else:
+            print("Skip argument %s" % name)
+    executor = content.bind(ctx=ctx, args=arg_dict, args_grad=None, grad_req="null")
+    return ConvExecutor(executor=executor,
+                        data=arg_dict["data"],
+                        data_grad=None,
+                        style=None,
+                        content=executor.outputs[0],
+                        arg_dict=arg_dict)
+
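The `style` outputs grouped here are what `style_gram_symbol` in `basic.py` turns into Gram matrices, using `FullyConnected(x, x, no_bias=True, num_hidden=C)` as a fast `dot(x, x^T)`. A small numpy check of that identity (illustrative only; shapes follow the reshape in `basic.py`):

```python
import numpy as np

# One style layer's feature map, flattened to (channels, height * width)
# as done by the Reshape in style_gram_symbol.
C, HW = 4, 9
x = np.random.randn(C, HW).astype(np.float32)

# FullyConnected computes y = data . weight^T, so with data = weight = x
# the output is x . x^T: the (C, C) Gram matrix of channel co-activations.
gram_fc_trick = x.dot(x.T)
gram_definition = np.array([[np.dot(x[i], x[j]) for j in range(C)]
                            for i in range(C)])
assert np.allclose(gram_fc_trick, gram_definition)
```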
diff --git a/example/notebooks/predict-with-pretrained-model.ipynb b/example/notebooks/predict-with-pretrained-model.ipynb
index 73ba99071890..f85157dc714f 100644
--- a/example/notebooks/predict-with-pretrained-model.ipynb
+++ b/example/notebooks/predict-with-pretrained-model.ipynb
@@ -16,7 +16,7 @@
     "For network structure, you can visualize it in [Composite Symbol Demo](composite_symbol.ipynb)\n",
     "\n",
     "The pre-trained Inception-BatchNorm network is able to be downloaded from:\n",
-    "[http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip](http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip)\n",
+    "[http://data.dmlc.ml/mxnet/data/Inception.zip](http://data.dmlc.ml/mxnet/data/Inception.zip)\n",
     "This model achieves Top-1 Accuracy: 70% and Top-5 Accuracy: 89.9%\n",
     "\n",
     "Note: This network is trained by using very simple augmentation (random flip + random crop). We will release model with a little bit more augmentation (which achieves better validation score)"
diff --git a/example/rcnn/LICENSE b/example/rcnn/LICENSE
index 07b70c57b8d5..84eb07876986 100644
--- a/example/rcnn/LICENSE
+++ b/example/rcnn/LICENSE
@@ -42,6 +42,31 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 DEALINGS IN THE SOFTWARE.
 
+Faster R-CNN
+
+The MIT License (MIT)
+
+Copyright (c) 2015 Microsoft Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+
 Caffe
 
 COPYRIGHT
diff --git a/example/rcnn/README.md b/example/rcnn/README.md
index e35d09cb92e7..60f5527cb907 100644
--- a/example/rcnn/README.md
+++ b/example/rcnn/README.md
@@ -1,4 +1,10 @@
-# Fast R-CNN in MXNet
+# Faster R-CNN in MXNet with distributed implementation and data parallelization
+
+A Region Proposal Network solves object detection as a regression problem
+from the objectness perspective. Bounding boxes are predicted by applying
+learned bounding box deltas to base boxes, namely anchor boxes across
+different positions in feature maps. The training process directly learns a
+mapping from raw image intensities to bounding box transformation targets.
 
 Fast R-CNN treats general object detection as a classification problem and
 bounding box prediction as a regression problem. Classifying cropped region
@@ -7,132 +13,67 @@ detection results. Cropping feature maps instead of image input accelerates
 computation utilizing shared convolution maps. Bounding box displacements
 are simultaneously learned in the training process.
 
+Faster R-CNN utilizes an alternating optimization training process between the RPN
+and Fast R-CNN. Fast R-CNN weights are used to initialize the RPN for training.
+
 ## Getting Started
+* Install the python packages `easydict`, `cv2` and `matplotlib`. MXNet requires `numpy`.
+* Install MXNet with a version no earlier than commit 8a3424e, preferably the latest master.
+  Follow the instructions at http://mxnet.readthedocs.io/en/latest/how_to/build.html. Install the python interface.
+* Try out the detection result by running `python demo.py --prefix final --epoch 0 --image myimage.jpg --gpu 0`.
+  This assumes you have downloaded the pretrained network, placed the extracted file `final-0000.params` in this folder, and have an image named `myimage.jpg`.
 
-* MXNet with `ROIPooling` and `smooth_l1` operators is required
-* Download data and place them to `data` folder according to `Data Folder Structure`.
-  You might want to create a symbolic link to VOCdevkit folder
-```
-Pascal VOCdevkit
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
-http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar
-Ross's precomputed object proposals
-https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_selective_search_data.sh
-```
-* Data Folder Structure (suppose root is `data`)
-```
-demo
-selective_search_data
-cache (created by imdb)
--- name + source + roidb.pkl (create by imdb)
--- name (created by detection and evaluation)
-VOCdevkit
--- VOC + year (JPEG images and annotations)
--- results (created by evaluation)
---- VOC + year
----- main
------ comp4_det_val_aeroplane.txt
-```
+## Training and Testing Faster R-CNN
+* Install the additional python package `scipy`.
+* Download Pascal VOC data and place it in the `data` folder according to `Data Folder Structure`.
+  You might want to create a symbolic link to the VOCdevkit folder by `ln -s /path/to/your/VOCdevkit data/VOCdevkit`.
 * Download VGG16 pretrained model, use `mxnet/tools/caffe_converter` to convert it,
-  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in `model` folder
-* Download 'demo' data and put it in `data/demo` from
-```
-https://github.com/rbgirshick/fast-rcnn/tree/master/data/demo
-```
-
-## Training
-* Start training by run `python train.py`. Variable args can be found by run
-`python train.py --help`.
-* Training can be done in cpu, modify `train.py` accordingly.
-* Training can be done in multiple gpus.
-```
-usage: train.py [-h] [--image_set IMAGE_SET] [--year YEAR]
-                [--root_path ROOT_PATH] [--devkit_path DEVKIT_PATH]
-                [--pretrained PRETRAINED] [--epoch EPOCH] [--prefix PREFIX]
-                [--gpus GPU_ID] [--begin_epoch BEGIN_EPOCH]
-                [--end_epoch END_EPOCH] [--frequent FREQUENT]
-
-Train a Fast R-CNN network
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_set IMAGE_SET
-                        can be trainval or train
-  --year YEAR           can be 2007, 2010, 2012
-  --root_path ROOT_PATH
-                        output data folder
-  --devkit_path DEVKIT_PATH
-                        VOCdevkit path
-  --pretrained PRETRAINED
-                        pretrained model prefix
-  --epoch EPOCH         epoch of pretrained model
-  --prefix PREFIX       new model prefix
-  --gpus GPU_ID         GPU devices to train with
-  --begin_epoch BEGIN_EPOCH
-                        begin epoch of training
-  --end_epoch END_EPOCH
-                        end epoch of training
-  --frequent FREQUENT   frequency of logging
-  --kv_store KV_STORE   kv_store type used in multi-device training
-  --work_load_list WORK_LOAD_LIST
-                        list of work load for different devices
-```
-- Performance in terms of training speed
+  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in the `model` folder.
+  The `model` folder will be used to store model checkpoints during the training process.
+* Start training by running `python train_alternate.py` after VOCdevkit is ready.
+  A typical command would be `python train_alternate.py --gpus 0`. This will train the network on the VOC07 trainval set.
+  More control of the training process can be found in the argparse help, accessed by `python train_alternate.py -h`.
+* Start testing by running `python test.py` after completing the training process.
+  A typical command would be `python test.py --has_rpn --prefix model/final --epoch 8`. This will test the network on the VOC07 test set.
+  Adding `--vis` will turn on visualization, and `-h` will show help, as in the training process.
 
-  | GPUs | batch size | samples per second |
-  | --- | --- | --- |
-  | 1 | 2 | 3.02 |
-  | 2 | 4 | 3.80 |
-  | 4 | 8 | 5.96 |
-
-
-## Testing
-* Start testing by run `python test.py`. Variable args can be found by run
-`python test.py --help`.
-* Testing can be done in cpu, modify `test.py` accordingly.
-```
-usage: test.py [-h] [--image_set IMAGE_SET] [--year YEAR]
-               [--root_path ROOT_PATH] [--devkit_path DEVKIT_PATH]
-               [--prefix PREFIX] [--epoch EPOCH] [--gpu GPU_ID]
-
-Test a Fast R-CNN network
+## Training and Testing Fast R-CNN
+* Download Pascal VOC data and place it in the `data` folder according to `Data Folder Structure`.
+  You might want to create a symbolic link to the VOCdevkit folder by `ln -s /path/to/your/VOCdevkit data/VOCdevkit`.
+* Download precomputed selective search data and place it in the `data` folder according to `Data Folder Structure`.
+* Download VGG16 pretrained model, use `mxnet/tools/caffe_converter` to convert it,
+  rename to `vgg16-symbol.json` and `vgg16-0001.params` and place it in the `model` folder.
+  The `model` folder will be used to store model checkpoints during the training process.
+* Start training by running `python -m tools.train_rcnn --proposal ss` to use the selective search proposal.
+* Start testing by running `python -m tools.test_rcnn --proposal ss`.
 
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_set IMAGE_SET
-                        can be test
-  --year YEAR           can be 2007, 2010, 2012
-  --root_path ROOT_PATH
-                        output data folder
-  --devkit_path DEVKIT_PATH
-                        VOCdevkit path
-  --prefix PREFIX       new model prefix
-  --epoch EPOCH         epoch of pretrained model
-  --gpu GPU_ID          GPU device to test with
-```
+## Information
+* Download link to the trained model
+  Baidu Yun: http://pan.baidu.com/s/1boRhGvH (ixiw) or Dropbox: https://www.dropbox.com/s/jrr83q0ai2ckltq/final-0000.params.tar.gz?dl=0
+* Download link to Pascal VOC and precomputed selective search proposals
 
-## Demonstration
-* If no training has been done, download reference model from Ross Girshick and use
-`mxnet/caffe/caffe_converter` to convert it to MXNet.
-```
-https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_fast_rcnn_models.sh
-```
-* Run demo by `demo.py --gpu 0 --prefix path-to-model --epoch 0`, in which
-`path-to-model + '%4d' % epoch.params` will be the params file and
-`path-to-model + '-symbol.json'` will be the symbol json.
-* Demo can be run in cpu, modify `demo.py` accordingly.
-``` -usage: demo.py [-h] [--prefix PREFIX] [--epoch EPOCH] [--gpu GPU_ID] + ``` + Pascal VOCdevkit + http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar + http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar + http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCdevkit_08-Jun-2007.tar + selective_search_data (by Ross Girshick) + Download link accessible at https://github.com/rbgirshick/fast-rcnn/blob/master/data/scripts/fetch_selective_search_data.sh + ``` -Demonstrate a Fast R-CNN network +* Data Folder Structure (create a `data` folder if there is none) -optional arguments: - -h, --help show this help message and exit - --prefix PREFIX new model prefix - --epoch EPOCH epoch of pretrained model - --gpu GPU_ID GPU device to test with -``` + ``` + VOCdevkit + -- VOC + year (JPEG images and annotations) + -- results (will be created by evaluation) + ---- VOC + year + ------ main + -------- comp4_det_val_aeroplane.txt + selective_search_data + rpn_data (will be created by rpn) + cache (will be created by imdb) + ``` ## Disclaimer This repository used code from [MXNet](https://github.com/dmlc/mxnet), @@ -142,3 +83,12 @@ This repository used code from [MXNet](https://github.com/dmlc/mxnet), [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/), [ImageNet](http://image-net.org/). Model comes from [VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/). + +## References +1. Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. In Neural Information Processing Systems, Workshop on Machine Learning Systems, 2015 +2. Ross Girshick. "Fast R-CNN." In Proceedings of the IEEE International Conference on Computer Vision, 2015. +3. Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. "Faster R-CNN: Towards real-time object detection with region proposal networks." In Advances in Neural Information Processing Systems, 2015. +4. Yangqing Jia, Evan Shelhamer, Jeff Donahue, Sergey Karayev, Jonathan Long, Ross Girshick, Sergio Guadarrama, and Trevor Darrell. "Caffe: Convolutional architecture for fast feature embedding." In Proceedings of the ACM International Conference on Multimedia, 2014. +5. Mark Everingham, Luc Van Gool, Christopher KI Williams, John Winn, and Andrew Zisserman. "The pascal visual object classes (voc) challenge." International journal of computer vision 88, no. 2 (2010): 303-338. +6. Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. "ImageNet: A large-scale hierarchical image database." In Computer Vision and Pattern Recognition, IEEE Conference on, 2009. +7. Karen Simonyan, and Andrew Zisserman. "Very deep convolutional networks for large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). 
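The README's phrase "applying learned bounding box deltas to base boxes" refers to the (dx, dy, dw, dh) parameterization that `helper/processing/bbox_transform.py` implements later in this patch. Here is a minimal numpy sketch of the decoding step, mirroring the conventions of `bbox_pred` (illustrative only; `apply_deltas` is a hypothetical name):

```python
import numpy as np

def apply_deltas(boxes, deltas):
    """Decode (dx, dy, dw, dh) deltas against base boxes [x1, y1, x2, y2]."""
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)

    dx, dy, dw, dh = deltas[:, 0], deltas[:, 1], deltas[:, 2], deltas[:, 3]
    pred_ctr_x = dx * widths + ctr_x    # shift the centre by a width fraction
    pred_ctr_y = dy * heights + ctr_y
    pred_w = np.exp(dw) * widths        # scale width/height in log space
    pred_h = np.exp(dh) * heights

    return np.stack([pred_ctr_x - 0.5 * (pred_w - 1.0),
                     pred_ctr_y - 0.5 * (pred_h - 1.0),
                     pred_ctr_x + 0.5 * (pred_w - 1.0),
                     pred_ctr_y + 0.5 * (pred_h - 1.0)], axis=1)

# Zero deltas leave a base box unchanged.
anchor = np.array([[0.0, 0.0, 15.0, 15.0]])
assert np.allclose(apply_deltas(anchor, np.zeros((1, 4))), anchor)
```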
diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py
index 768b1a7fe15a..fb110849663b 100644
--- a/example/rcnn/demo.py
+++ b/example/rcnn/demo.py
@@ -1,25 +1,73 @@
 import argparse
-import mxnet as mx
 import os
-from tools.load_model import load_param
-from rcnn.symbol import get_symbol_vgg_test
+import numpy as np
+import cv2
+
+import mxnet as mx
+
+from helper.processing.image_processing import resize, transform
+from helper.processing.nms import nms
+from rcnn.config import config
 from rcnn.detector import Detector
-from tools.demo_net import demo_net
+from rcnn.symbol import get_vgg_test
+from rcnn.tester import vis_all_detection
+from utils.load_model import load_param
 
 
 def get_net(prefix, epoch, ctx):
     args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx)
-    sym = get_symbol_vgg_test()
+    sym = get_vgg_test()
     detector = Detector(sym, ctx, args, auxs)
     return detector
 
 
+CLASSES = ('__background__',
+           'aeroplane', 'bicycle', 'bird', 'boat',
+           'bottle', 'bus', 'car', 'cat', 'chair',
+           'cow', 'diningtable', 'dog', 'horse',
+           'motorbike', 'person', 'pottedplant',
+           'sheep', 'sofa', 'train', 'tvmonitor')
+
+
+def demo_net(detector, image_name):
+    """
+    wrapper for detector
+    :param detector: Detector
+    :param image_name: image name
+    :return: None
+    """
+    config.TEST.HAS_RPN = True
+    assert os.path.exists(image_name), image_name + ' not found'
+    im = cv2.imread(image_name)
+    im_array, im_scale = resize(im, config.SCALES[0], config.MAX_SIZE)
+    im_array = transform(im_array, config.PIXEL_MEANS)
+    im_info = np.array([[im_array.shape[2], im_array.shape[3], im_scale]], dtype=np.float32)
+
+    scores, boxes = detector.im_detect(im_array, im_info)
+
+    all_boxes = [[] for _ in CLASSES]
+    CONF_THRESH = 0.8
+    NMS_THRESH = 0.3
+    for cls in CLASSES:
+        cls_ind = CLASSES.index(cls)
+        cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)]
+        cls_scores = scores[:, cls_ind]
+        keep = np.where(cls_scores >= CONF_THRESH)[0]
+        cls_boxes = cls_boxes[keep, :]
+        cls_scores = cls_scores[keep]
+        dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32)
+        keep = nms(dets.astype(np.float32), NMS_THRESH)
+        all_boxes[cls_ind] = dets[keep, :]
+
+    boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))]
+    vis_all_detection(im_array, boxes_this_image, CLASSES, 0)
+
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Demonstrate a Fast R-CNN network')
-    parser.add_argument('--prefix', dest='prefix', help='new model prefix',
-                        default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str)
-    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model',
-                        default=9, type=int)
+    parser = argparse.ArgumentParser(description='Demonstrate a Faster R-CNN network')
+    parser.add_argument('--image', dest='image', help='custom image', type=str)
+    parser.add_argument('--prefix', dest='prefix', help='saved model prefix', type=str)
+    parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', type=int)
     parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with',
                         default=0, type=int)
     args = parser.parse_args()
@@ -29,5 +77,4 @@ def parse_args():
     args = parse_args()
     ctx = mx.gpu(args.gpu_id)
     detector = get_net(args.prefix, args.epoch, ctx)
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '000004'))
-    demo_net(detector, os.path.join(os.getcwd(), 'data', 'demo', '001551'))
+    demo_net(detector, args.image)
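`demo_net` above thresholds class scores and then runs non-maximum suppression (`nms` from `helper.processing.nms`). For reference, here is a self-contained greedy-NMS sketch in the same spirit; this is an illustration, not the repository's implementation:

```python
import numpy as np

def greedy_nms(dets, thresh):
    """Keep the highest-scoring boxes, dropping overlaps above IoU thresh.

    dets rows are [x1, y1, x2, y2, score].
    """
    x1, y1, x2, y2 = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3]
    scores = dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]          # best scores first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # intersection of the kept box with every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # survivors: boxes overlapping the kept box by no more than thresh
        order = order[np.where(iou <= thresh)[0] + 1]
    return keep
```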
diff --git a/example/rcnn/helper/dataset/imdb.py b/example/rcnn/helper/dataset/imdb.py
index 3c431ff5bfd1..8f53ce5412e6 100644
--- a/example/rcnn/helper/dataset/imdb.py
+++ b/example/rcnn/helper/dataset/imdb.py
@@ -32,7 +32,7 @@ def roidb(self, gt_roidb):
     def create_roidb_from_box_list(self, box_list, gt_roidb):
         """
         given ground truth, prepare roidb
-        :param box_list: [image_index][box_index][x1, x2, y1, y2]
+        :param box_list: [image_index] ndarray of [box_index][x1, x2, y1, y2]
         :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
         :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
         """
@@ -43,7 +43,7 @@ def create_roidb_from_box_list(self, box_list, gt_roidb):
             num_boxes = boxes.shape[0]
             overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32)
 
-            if gt_roidb is not None:
+            if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0:
                 gt_boxes = gt_roidb[i]['boxes']
                 gt_classes = gt_roidb[i]['gt_classes']
                 # n boxes and k gt_boxes => n * k overlap
@@ -106,5 +106,86 @@ def append_flipped_images(self, roidb):
         self.image_set_index *= 2
         return roidb
 
+    def evaluate_recall(self, roidb, candidate_boxes=None, thresholds=None, area='all', limit=None):
+        """
+        evaluate detection proposal recall metrics
+        record the max overlap value for each gt box
+        :param roidb: used to evaluate
+        :param candidate_boxes: if not given, use roidb's non-gt boxes
+        :param thresholds: array-like recall thresholds
+        :param limit: limit of bounding boxes evaluated
+        :param area: index in area ranges
+        :return: None; prints ar (average recall) and the recall at each
+                 IoU overlap threshold
+        """
+        areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3,
+                 '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7}
+        area_ranges = [[0**2, 1e5**2], [0**2, 32**2], [32**2, 96**2], [96**2, 1e5**2],
+                       [96**2, 128**2], [128**2, 256**2], [256**2, 512**2], [512**2, 1e5**2]]
+        assert areas.has_key(area), 'unknown area range: {}'.format(area)
+        area_range = area_ranges[areas[area]]
+        gt_overlaps = np.zeros(0)
+        num_pos = 0
+        for i in range(self.num_images):
+            # check for max_overlaps == 1 avoids including crowd annotations
+            max_gt_overlaps = roidb[i]['gt_overlaps'].toarray().max(axis=1)
+            gt_inds = np.where((roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0]
+            gt_boxes = roidb[i]['boxes'][gt_inds, :]
+            gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1)
+            valid_gt_inds = np.where((gt_areas >= area_range[0]) & (gt_areas <= area_range[1]))[0]
+            gt_boxes = gt_boxes[valid_gt_inds, :]
+            num_pos += len(valid_gt_inds)
+
+            if candidate_boxes is None:
+                # default is use the non-gt boxes from roidb
+                non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0]
+                boxes = roidb[i]['boxes'][non_gt_inds, :]
+            else:
+                boxes = candidate_boxes[i]
+            if boxes.shape[0] == 0:
+                continue
+            if limit is not None and boxes.shape[0] > limit:
+                boxes = boxes[:limit, :]
+
+            overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float))
+
+            _gt_overlaps = np.zeros((gt_boxes.shape[0]))
+            for j in range(gt_boxes.shape[0]):
+                # find which proposal maximally covers each gt box
+                argmax_overlaps = overlaps.argmax(axis=0)
+                # get the IoU amount of coverage for each gt box
+                max_overlaps = overlaps.max(axis=0)
+                # find which gt box is covered by most IoU
+                gt_ind = max_overlaps.argmax()
+                gt_ovr = max_overlaps.max()
+                assert (gt_ovr >= 0)
+                # find the proposal box that covers the best covered gt box
+                box_ind = 
argmax_overlaps[gt_ind] + # record the IoU coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert (_gt_overlaps[j] == gt_ovr) + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + # append recorded IoU coverage level + gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + + gt_overlaps = np.sort(gt_overlaps) + if thresholds is None: + step = 0.05 + thresholds = np.arange(0.5, 0.95 + 1e-5, step) + recalls = np.zeros_like(thresholds) + + # compute recall for each IoU threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + ar = recalls.mean() + + # print results + print 'average recall: {:.3f}'.format(ar) + for threshold, recall in zip(thresholds, recalls): + print 'recall @{:.2f}: {:.3f}'.format(threshold, recall) + def evaluate_detections(self, detections): raise NotImplementedError diff --git a/example/rcnn/helper/dataset/pascal_voc.py b/example/rcnn/helper/dataset/pascal_voc.py index 7d2356ba3eef..9ae27f4b91e5 100644 --- a/example/rcnn/helper/dataset/pascal_voc.py +++ b/example/rcnn/helper/dataset/pascal_voc.py @@ -13,6 +13,7 @@ import cPickle from imdb import IMDB from voc_eval import voc_eval +from helper.processing.bbox_process import unique_boxes, filter_small_boxes class PascalVOC(IMDB): @@ -43,7 +44,8 @@ def __init__(self, image_set, year, root_path, devkit_path): self.num_images = len(self.image_set_index) self.config = {'comp_id': 'comp4', - 'use_diff': True} + 'use_diff': False, + 'min_size': 2} @property def cache_path(self): @@ -102,17 +104,14 @@ def load_pascal_annotation(self, index): :param index: index of a specific image :return: record['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] """ - import xml.dom.minidom as minidom + import xml.etree.ElementTree as ET filename = os.path.join(self.data_path, 'Annotations', index + '.xml') - # print 'Loading: {}'.format(filename) - def get_data_from_tag(node, tag): - return node.getElementsByTagName(tag)[0].childNodes[0].data - - with open(filename) as f: - data = minidom.parseString(f.read()) - - objs = data.getElementsByTagName('object') + tree = ET.parse(filename) + objs = tree.findall('object') + if not self.config['use_diff']: + non_diff_objs = [obj for obj in objs if int(obj.find('difficult').text) == 0] + objs = non_diff_objs num_objs = len(objs) boxes = np.zeros((num_objs, 4), dtype=np.uint16) @@ -122,13 +121,13 @@ def get_data_from_tag(node, tag): class_to_index = dict(zip(self.classes, range(self.num_classes))) # Load object bounding boxes into a data frame. for ix, obj in enumerate(objs): + bbox = obj.find('bndbox') # Make pixel indexes 0-based - x1 = float(get_data_from_tag(obj, 'xmin')) - 1 - y1 = float(get_data_from_tag(obj, 'ymin')) - 1 - x2 = float(get_data_from_tag(obj, 'xmax')) - 1 - y2 = float(get_data_from_tag(obj, 'ymax')) - 1 - cls = class_to_index[ - str(get_data_from_tag(obj, "name")).lower().strip()] + x1 = float(bbox.find('xmin').text) - 1 + y1 = float(bbox.find('ymin').text) - 1 + x2 = float(bbox.find('xmax').text) - 1 + y2 = float(bbox.find('ymax').text) - 1 + cls = class_to_index[obj.find('name').text.lower().strip()] boxes[ix, :] = [x1, y1, x2, y2] gt_classes[ix] = cls overlaps[ix, cls] = 1.0 @@ -155,7 +154,12 @@ def load_selective_search_roidb(self, gt_roidb): box_list = [] for i in range(raw_data.shape[0]): - box_list.append(raw_data[i][:, (1, 0, 3, 2)] - 1) # pascal voc dataset starts from 1. + boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 # pascal voc dataset starts from 1. 
+            keep = unique_boxes(boxes)
+            boxes = boxes[keep, :]
+            keep = filter_small_boxes(boxes, self.config['min_size'])
+            boxes = boxes[keep, :]
+            box_list.append(boxes)
 
         return self.create_roidb_from_box_list(box_list, gt_roidb)
 
@@ -183,6 +187,33 @@ def selective_search_roidb(self, gt_roidb):
 
         return roidb
 
+    def load_rpn_roidb(self, gt_roidb):
+        """
+        turn rpn detection boxes into roidb
+        :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
+        :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped']
+        """
+        rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_rpn.pkl')
+        print 'loading {}'.format(rpn_file)
+        assert os.path.exists(rpn_file), 'rpn data not found at {}'.format(rpn_file)
+        with open(rpn_file, 'rb') as f:
+            box_list = cPickle.load(f)
+        return self.create_roidb_from_box_list(box_list, gt_roidb)
+
+    def rpn_roidb(self, gt_roidb):
+        """
+        get rpn roidb and ground truth roidb
+        :param gt_roidb: ground truth roidb
+        :return: roidb of rpn (ground truth included)
+        """
+        if self.image_set != 'test':
+            rpn_roidb = self.load_rpn_roidb(gt_roidb)
+            roidb = IMDB.merge_roidbs(gt_roidb, rpn_roidb)
+        else:
+            print 'rpn database need not be used in test'
+            roidb = self.load_rpn_roidb(gt_roidb)
+        return roidb
+
     def evaluate_detections(self, detections):
         """
         top level evaluations
diff --git a/example/rcnn/helper/dataset/voc_eval.py b/example/rcnn/helper/dataset/voc_eval.py
index 3b2c153c0de5..8975b619b708 100644
--- a/example/rcnn/helper/dataset/voc_eval.py
+++ b/example/rcnn/helper/dataset/voc_eval.py
@@ -95,7 +95,6 @@ def voc_eval(detpath, annopath, imageset_file, classname, cache_dir, ovthresh=0.
     else:
         with open(cache_file, 'r') as f:
             recs = cPickle.load(f)
-        print 'ground truth annotations loaded from cache file {}'.format(cache_file)
 
     # extract objects in :param classname:
     class_recs = {}
diff --git a/example/rcnn/helper/processing/bbox_process.py b/example/rcnn/helper/processing/bbox_process.py
new file mode 100644
index 000000000000..60d8a7af86bd
--- /dev/null
+++ b/example/rcnn/helper/processing/bbox_process.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+
+def unique_boxes(boxes, scale=1.0):
+    """ return indices of unique boxes """
+    v = np.array([1, 1e3, 1e6, 1e9])
+    hashes = np.round(boxes * scale).dot(v)
+    _, index = np.unique(hashes, return_index=True)
+    return np.sort(index)
+
+
+def filter_small_boxes(boxes, min_size):
+    w = boxes[:, 2] - boxes[:, 0]
+    h = boxes[:, 3] - boxes[:, 1]
+    keep = np.where((w >= min_size) & (h >= min_size))[0]
+    return keep
diff --git a/example/rcnn/helper/processing/bbox_regression.py b/example/rcnn/helper/processing/bbox_regression.py
index 7e58324fc541..840a96cc5ec5 100644
--- a/example/rcnn/helper/processing/bbox_regression.py
+++ b/example/rcnn/helper/processing/bbox_regression.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from rcnn.config import config
+from bbox_transform import bbox_transform
 
 
 def bbox_overlaps(boxes, query_boxes):
@@ -43,6 +44,8 @@ def compute_bbox_regression_targets(rois, overlaps, labels):
     # Indices of ground-truth ROIs
     gt_inds = np.where(overlaps == 1)[0]
+    if len(gt_inds) == 0:
+        print 'something is wrong: zero ground truth rois'
     # Indices of examples for which we try to make predictions
     ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0]
 
@@ -55,27 +58,9 @@ def compute_bbox_regression_targets(rois, overlaps, labels):
     gt_rois = rois[gt_inds[gt_assignment], :]
     ex_rois = rois[ex_inds, :]
 
-    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + config['EPS']
-    ex_heights = ex_rois[:,
3] - ex_rois[:, 1] + config['EPS'] - ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths - ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights - - gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + config['EPS'] - gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + config['EPS'] - gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths - gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights - - targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths - targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights - targets_dw = np.log(gt_widths / ex_widths) - targets_dh = np.log(gt_heights / ex_heights) - targets = np.zeros((rois.shape[0], 5), dtype=np.float32) targets[ex_inds, 0] = labels[ex_inds] - targets[ex_inds, 1] = targets_dx - targets[ex_inds, 2] = targets_dy - targets[ex_inds, 3] = targets_dw - targets[ex_inds, 4] = targets_dh + targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) return targets diff --git a/example/rcnn/helper/processing/bbox_transform.py b/example/rcnn/helper/processing/bbox_transform.py index ba5187f2ab0c..0757a70eedd7 100644 --- a/example/rcnn/helper/processing/bbox_transform.py +++ b/example/rcnn/helper/processing/bbox_transform.py @@ -1,10 +1,37 @@ """ -This file has functions about bounding box post processing. +This file has functions about bounding box processing. """ import numpy as np +def bbox_transform(ex_rois, gt_rois): + """ + compute bounding box regression targets from ex_rois to gt_rois + :param ex_rois: [N, 4] + :param gt_rois: [N, 4] + :return: [N, 4] + """ + ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 + ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 + ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) + ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) + + gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 + gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 + gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) + gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) + + targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) + targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) + targets_dw = np.log(gt_widths / ex_widths) + targets_dh = np.log(gt_heights / ex_heights) + + targets = np.vstack( + (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + return targets + + def bbox_pred(boxes, box_deltas): """ Transform the set of class-agnostic boxes into class-specific boxes @@ -17,10 +44,10 @@ def bbox_pred(boxes, box_deltas): return np.zeros((0, box_deltas.shape[1])) boxes = boxes.astype(np.float, copy=False) - widths = boxes[:, 2] - boxes[:, 0] + 1e-14 - heights = boxes[:, 3] - boxes[:, 1] + 1e-14 - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights + widths = boxes[:, 2] - boxes[:, 0] + 1.0 + heights = boxes[:, 3] - boxes[:, 1] + 1.0 + ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) + ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) dx = box_deltas[:, 0::4] dy = box_deltas[:, 1::4] @@ -34,13 +61,13 @@ def bbox_pred(boxes, box_deltas): pred_boxes = np.zeros(box_deltas.shape) # x1 - pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * (pred_w - 1.0) # y1 - pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * (pred_h - 1.0) # x2 - pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * (pred_w - 1.0) # y2 - pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * (pred_h - 1.0) return pred_boxes @@ -53,11 +80,11 @@ def clip_boxes(boxes, im_shape): :return: [N, 4* num_classes] """ # x1 >= 0 - boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) + boxes[:, 
0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) # y1 >= 0 - boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) + boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) # x2 < im_shape[1] - boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) + boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) # y2 < im_shape[0] - boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) + boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) return boxes diff --git a/example/rcnn/helper/processing/generate_anchor.py b/example/rcnn/helper/processing/generate_anchor.py new file mode 100644 index 000000000000..8996a3aaab48 --- /dev/null +++ b/example/rcnn/helper/processing/generate_anchor.py @@ -0,0 +1,72 @@ +""" +Generate base anchors on index 0 +""" + +import numpy as np + + +def generate_anchors(base_size=16, ratios=[0.5, 1, 2], + scales=2 ** np.arange(3, 6)): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. + """ + + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = _ratio_enum(base_anchor, ratios) + anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) + for i in xrange(ratio_anchors.shape[0])]) + return anchors + + +def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). + """ + + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + +def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). + """ + + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + + +def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. 
+ """ + + w, h, x_ctr, y_ctr = _whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = _mkanchors(ws, hs, x_ctr, y_ctr) + return anchors diff --git a/example/rcnn/helper/processing/roidb.py b/example/rcnn/helper/processing/roidb.py index d68ddb94290c..7ad1b26c182f 100644 --- a/example/rcnn/helper/processing/roidb.py +++ b/example/rcnn/helper/processing/roidb.py @@ -4,6 +4,7 @@ extended ['image', 'max_classes', 'max_overlaps', 'bbox_targets'] """ +import cv2 import numpy as np from bbox_regression import compute_bbox_regression_targets @@ -17,8 +18,13 @@ def prepare_roidb(imdb, roidb): :param roidb: roidb :return: None """ + print 'prepare roidb' for i in range(len(roidb)): # image_index roidb[i]['image'] = imdb.image_path_from_index(imdb.image_set_index[i]) + if config.TRAIN.ASPECT_GROUPING: + size = cv2.imread(roidb[i]['image']).shape + roidb[i]['height'] = size[0] + roidb[i]['width'] = size[1] gt_overlaps = roidb[i]['gt_overlaps'].toarray() max_overlaps = gt_overlaps.max(axis=1) max_classes = gt_overlaps.argmax(axis=1) @@ -51,22 +57,27 @@ def add_bbox_regression_targets(roidb): max_classes = roidb[im_i]['max_classes'] roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes) - # compute mean, std values - class_counts = np.zeros((num_classes, 1)) + config.EPS - sums = np.zeros((num_classes, 4)) - squared_sums = np.zeros((num_classes, 4)) - for im_i in range(num_images): - targets = roidb[im_i]['bbox_targets'] - for cls in range(1, num_classes): - cls_indexes = np.where(targets[:, 0] == cls)[0] - if cls_indexes.size > 0: - class_counts[cls] += cls_indexes.size - sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) - squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + # use fixed / precomputed means and stds instead of empirical values + means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1)) + stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1)) + else: + # compute mean, std values + class_counts = np.zeros((num_classes, 1)) + config.EPS + sums = np.zeros((num_classes, 4)) + squared_sums = np.zeros((num_classes, 4)) + for im_i in range(num_images): + targets = roidb[im_i]['bbox_targets'] + for cls in range(1, num_classes): + cls_indexes = np.where(targets[:, 0] == cls)[0] + if cls_indexes.size > 0: + class_counts[cls] += cls_indexes.size + sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) + squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) - means = sums / class_counts - # var(x) = E(x^2) - E(x)^2 - stds = np.sqrt(squared_sums / class_counts - means ** 2) + means = sums / class_counts + # var(x) = E(x^2) - E(x)^2 + stds = np.sqrt(squared_sums / class_counts - means ** 2) # normalized targets for im_i in range(num_images): diff --git a/example/rcnn/rcnn/config.py b/example/rcnn/rcnn/config.py index 9ae69cb8adc1..fb9826c1b186 100644 --- a/example/rcnn/rcnn/config.py +++ b/example/rcnn/rcnn/config.py @@ -3,26 +3,65 @@ config = edict() +# image processing config config.EPS = 1e-14 config.PIXEL_MEANS = np.array([[[123.68, 116.779, 103.939]]]) +config.SCALES = (600, ) # single scale training and testing +config.MAX_SIZE = 1000 + +# nms config +config.USE_GPU_NMS = True +config.GPU_ID = 0 config.TRAIN = edict() -config.TRAIN.SCALES = (600, ) -config.TRAIN.MAX_SIZE = 1000 +# R-CNN and RPN +config.TRAIN.FINETUNE = False +config.TRAIN.BATCH_SIZE = 128 # used in grad_scale +# R-CNN +config.TRAIN.HAS_RPN = False +config.TRAIN.ASPECT_GROUPING 
= True config.TRAIN.BATCH_IMAGES = 2 -config.TRAIN.BATCH_SIZE = 128 config.TRAIN.FG_FRACTION = 0.25 config.TRAIN.FG_THRESH = 0.5 config.TRAIN.BG_THRESH_HI = 0.5 config.TRAIN.BG_THRESH_LO = 0.1 +# R-CNN bounding box regression config.TRAIN.BBOX_REGRESSION_THRESH = 0.5 config.TRAIN.BBOX_INSIDE_WEIGHTS = np.array([1.0, 1.0, 1.0, 1.0]) +# RPN anchor loader +config.TRAIN.RPN_BATCH_SIZE = 256 +config.TRAIN.RPN_FG_FRACTION = 0.5 +config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 +config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 +config.TRAIN.RPN_CLOBBER_POSITIVES = False +config.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) +config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 + +# used for end2end training +# RPN proposal +config.TRAIN.RPN_NMS_THRESH = 0.7 +config.TRAIN.RPN_PRE_NMS_TOP_N = 12000 +config.TRAIN.RPN_POST_NMS_TOP_N = 6000 +config.TRAIN.RPN_MIN_SIZE = 16 +# approximate bounding box regression +config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = False +config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0) +config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2) + config.TEST = edict() -config.TEST.SCALES = (600, ) +# R-CNN testing +config.TEST.HAS_RPN = False +config.TEST.BATCH_IMAGES = 1 config.TEST.NMS = 0.3 config.TEST.DEDUP_BOXES = 1. / 16. + +# RPN proposal +config.TEST.RPN_NMS_THRESH = 0.7 +config.TEST.RPN_PRE_NMS_TOP_N = 6000 +config.TEST.RPN_POST_NMS_TOP_N = 300 +config.TEST.RPN_MIN_SIZE = 16 diff --git a/example/rcnn/rcnn/data_iter.py b/example/rcnn/rcnn/data_iter.py index 69d9d64a3b2b..765334b2090f 100644 --- a/example/rcnn/rcnn/data_iter.py +++ b/example/rcnn/rcnn/data_iter.py @@ -1,12 +1,10 @@ import mxnet as mx import numpy as np import minibatch -from mxnet.executor_manager import _split_input_slice -from helper.processing.image_processing import tensor_vstack class ROIIter(mx.io.DataIter): - def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_load_list=None): + def __init__(self, roidb, batch_size=2, shuffle=False, mode='train'): """ This Iter will provide roi data to Fast R-CNN network :param roidb: must be preprocessed @@ -17,11 +15,9 @@ def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_l super(ROIIter, self).__init__() self.roidb = roidb - self.ctx = ctx self.batch_size = batch_size self.shuffle = shuffle self.mode = mode - self.work_load_list = work_load_list if self.mode != 'train': assert self.batch_size == 1 @@ -34,17 +30,16 @@ def __init__(self, roidb, ctx, batch_size=2, shuffle=False, mode='train', work_l self.data = None self.label = None self.get_batch() + self.data_name = self.data.keys() + self.label_name = self.label.keys() @property def provide_data(self): - return [('data', self.data[0].shape), ('rois', self.data[1].shape)] + return [(k, v.shape) for k, v in self.data.items()] @property def provide_label(self): - return [('cls_prob_label', self.label[0].shape), - ('bbox_loss_target', self.label[1].shape), - ('bbox_loss_inside_weight', self.label[2].shape), - ('bbox_loss_outside_weight', self.label[3].shape)] + return [(k, v.shape) for k, v in self.label.items()] def reset(self): self.cur = 0 @@ -58,13 +53,8 @@ def next(self): if self.iter_next(): self.get_batch() self.cur += self.batch_size - if self.mode == 'train': - return mx.io.DataBatch(data=self.data, label=self.label, - pad=self.getpad(), index=self.getindex(), - provide_data=self.provide_data, provide_label=self.provide_label) - else: - return mx.io.DataBatch(data=self.data, label=self.label, - pad=self.getpad(), index=self.getindex()) + return mx.io.DataBatch(data=self.data, 
label=self.label, + pad=self.getpad(), index=self.getindex()) else: raise StopIteration @@ -72,17 +62,17 @@ def getindex(self): return self.cur / self.batch_size def getpad(self): - if self.cur + self.batch_size > self.size: - return self.cur + self.batch_size - self.size - else: - return 0 + return max(0, self.cur + self.batch_size - self.size) def get_batch(self): if self.mode == 'train': self.batch = self._get_train_batch() - self.data = [mx.nd.array(self.batch['data']), mx.nd.array(self.batch['rois'])] - self.label = [mx.nd.array(self.batch['labels']), mx.nd.array(self.batch['bbox_targets']), - mx.nd.array(self.batch['bbox_inside_weights']), mx.nd.array(self.batch['bbox_outside_weights'])] + self.data = {'data': self.batch['data'], + 'rois': self.batch['rois']} + self.label = {'cls_prob_label': self.batch['labels'], + 'bbox_loss_target': self.batch['bbox_targets'], + 'bbox_loss_inside_weight': self.batch['bbox_inside_weights'], + 'bbox_loss_outside_weight': self.batch['bbox_outside_weights']} else: self.batch = self._get_test_batch() self.data = {'data': self.batch['data'], @@ -94,34 +84,11 @@ def _get_train_batch(self): utilize minibatch sampling, e.g. 2 images and 64 rois per image :return: training batch (e.g. 128 samples) """ - work_load_list = self.work_load_list - ctx = self.ctx - if work_load_list is None: - work_load_list = [1] * len(ctx) - assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ - "Invalid settings for work load. " - slices = _split_input_slice(self.batch_size, work_load_list) - cur_from = self.cur - cur_to = cur_from + self.batch_size - if cur_to <= self.size: - roidb = [self.roidb[i] for i in range(cur_from, cur_to)] - else: - pad = cur_to - self.size - roidb = self.roidb[cur_from:] + self.roidb[:pad] - - batch_list = [] - for islice in slices: - num_im = islice.stop - islice.start - iroidb = [roidb[i] for i in range(islice.start, islice.stop)] - batch = minibatch.get_minibatch(iroidb, self.num_classes, self.ctx) - batch_list.append(batch) - - all_batch = dict() - for key in batch_list[0].keys(): - all_batch[key] = tensor_vstack([batch[key] for batch in batch_list]) - - return all_batch + cur_from = self.cur + cur_to = min(cur_from + self.batch_size, self.size) + roidb = [self.roidb[i] for i in range(cur_from, cur_to)] + batch = minibatch.get_minibatch(roidb, self.num_classes) + return batch def _get_test_batch(self): """
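A minimal sketch of the pad rule used by getpad() above, with made-up sizes (not part of the patch): only a trailing partial batch should report a non-zero pad.

```python
# Illustration only: pad semantics for a DataIter with size=10, batch_size=4.
size, batch_size = 10, 4
for cur in range(0, size, batch_size):
    pad = max(0, cur + batch_size - size)
    print 'batch at', cur, 'pad =', pad   # -> 0, 0, 2
```

diff --git a/example/rcnn/rcnn/detector.py b/example/rcnn/rcnn/detector.py index cc9787d3fff4..8e424c973108 100644 --- a/example/rcnn/rcnn/detector.py +++ b/example/rcnn/rcnn/detector.py @@ -16,15 +16,16 @@ def __init__(self, symbol, ctx=None, self.aux_params = aux_params self.executor = None - def im_detect(self, im_array, roi_array): + def im_detect(self, im_array, im_info=None, roi_array=None): """ perform detection of designated im, box, must follow minibatch.get_testbatch format :param im_array: numpy.ndarray [b c h w] + :param im_info: numpy.ndarray [b 3] :param roi_array: numpy.ndarray [roi_num 5] :return: scores, pred_boxes """ # remove duplicate feature rois - if config.TEST.DEDUP_BOXES > 0: + if config.TEST.DEDUP_BOXES > 0 and not config.TEST.HAS_RPN: roi_array = roi_array # rank roi by v .* (b, dx, dy, dw, dh) v = np.array([1, 1e3, 1e6, 1e9, 1e12]) @@ -33,27 +34,44 @@ def im_detect(self, im_array, roi_array): _, index, inv_index = np.unique(hashes, return_index=True, return_inverse=True) roi_array = roi_array[index, :] - self.arg_params['data'] = mx.nd.array(im_array, self.ctx) - self.arg_params['rois'] = mx.nd.array(roi_array, 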
self.ctx) - arg_shapes, out_shapes, aux_shapes = \ - self.symbol.infer_shape(data=self.arg_params['data'].shape, rois=self.arg_params['rois'].shape) + # fill in data + if config.TEST.HAS_RPN: + self.arg_params['data'] = mx.nd.array(im_array, self.ctx) + self.arg_params['im_info'] = mx.nd.array(im_info, self.ctx) + arg_shapes, out_shapes, aux_shapes = \ + self.symbol.infer_shape(data=self.arg_params['data'].shape, im_info=self.arg_params['im_info'].shape) + else: + self.arg_params['data'] = mx.nd.array(im_array, self.ctx) + self.arg_params['rois'] = mx.nd.array(roi_array, self.ctx) + arg_shapes, out_shapes, aux_shapes = \ + self.symbol.infer_shape(data=self.arg_params['data'].shape, rois=self.arg_params['rois'].shape) + + # fill in label and aux arg_shapes_dict = {name: shape for name, shape in zip(self.symbol.list_arguments(), arg_shapes)} self.arg_params['cls_prob_label'] = mx.nd.zeros(arg_shapes_dict['cls_prob_label'], self.ctx) - aux_names = self.symbol.list_auxiliary_states() self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)} + + # execute self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=None, grad_req='null', aux_states=self.aux_params) output_dict = {name: nd for name, nd in zip(self.symbol.list_outputs(), self.executor.outputs)} - self.executor.forward(is_train=False) - scores = output_dict['cls_prob_output'].asnumpy() - bbox_deltas = output_dict['bbox_pred_output'].asnumpy() - pred_boxes = bbox_pred(roi_array[:, 1:], bbox_deltas) + # save output + scores = output_dict['cls_prob_reshape_output'].asnumpy()[0] + bbox_deltas = output_dict['bbox_pred_reshape_output'].asnumpy()[0] + if config.TEST.HAS_RPN: + rois = output_dict['rois_output'].asnumpy() + rois = rois[:, 1:].copy() # drop the batch index column; rois stay at the scaled image size + else: + rois = roi_array[:, 1:] + + # post processing + pred_boxes = bbox_pred(rois, bbox_deltas) pred_boxes = clip_boxes(pred_boxes, im_array[0].shape[-2:]) - if config.TEST.DEDUP_BOXES > 0: + if config.TEST.DEDUP_BOXES > 0 and not config.TEST.HAS_RPN: # map back to original scores = scores[inv_index, :] pred_boxes = pred_boxes[inv_index, :]
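A hypothetical view of the two im_detect call paths above; `detector`, `im_array`, `im_info` and `roi_array` are placeholders, not names from the patch.

```python
# Illustration only: RPN networks supply im_info, Fast R-CNN networks supply rois.
from rcnn.config import config
if config.TEST.HAS_RPN:
    scores, pred_boxes = detector.im_detect(im_array, im_info=im_info)
else:
    scores, pred_boxes = detector.im_detect(im_array, roi_array=roi_array)
```

diff --git a/example/rcnn/rcnn/loader.py b/example/rcnn/rcnn/loader.py new file mode 100644 index 000000000000..cea0900245a3 --- /dev/null +++ b/example/rcnn/rcnn/loader.py @@ -0,0 +1,298 @@ +import mxnet as mx +import numpy as np +import minibatch +from config import config +from mxnet.executor_manager import _split_input_slice +from helper.processing.image_processing import tensor_vstack + + +class ROIIter(mx.io.DataIter): + def __init__(self, roidb, batch_size=2, shuffle=False, mode='train', ctx=None, work_load_list=None): + """ + This Iter will provide roi data to Fast R-CNN network + :param roidb: must be preprocessed + :param batch_size: must divide BATCH_SIZE(128) + :param shuffle: bool + :param mode: control returned info + :param ctx: list of contexts + :param work_load_list: list of work load + :return: ROIIter + """ + super(ROIIter, self).__init__() + + self.roidb = roidb + self.batch_size = batch_size + self.shuffle = shuffle + self.mode = mode + self.ctx = ctx + if self.ctx is None: + self.ctx = [mx.cpu()] + self.work_load_list = work_load_list + + self.cur = 0 + self.size = len(roidb) + self.index = np.arange(self.size) + self.num_classes = self.roidb[0]['gt_overlaps'].shape[1] + self.reset() + + self.batch = None + self.data = None + self.label = None + self.get_batch() + self.data_name = ['data', 'rois'] + self.label_name = ['label', 'bbox_target', 'bbox_inside_weight', 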
'bbox_outside_weight'] + + @property + def provide_data(self): + if self.mode == 'train': + return [('data', self.data[0].shape), ('rois', self.data[1].shape)] + else: + return [(k, v.shape) for k, v in self.data.items()] + + @property + def provide_label(self): + if self.mode == 'train': + return [('label', self.label[0].shape), + ('bbox_target', self.label[1].shape), + ('bbox_inside_weight', self.label[2].shape), + ('bbox_outside_weight', self.label[3].shape)] + else: + return [(k, v.shape) for k, v in self.label.items()] + + def reset(self): + self.cur = 0 + if self.shuffle: + if config.TRAIN.ASPECT_GROUPING: + widths = np.array([r['width'] for r in self.roidb]) + heights = np.array([r['height'] for r in self.roidb]) + horz = (widths >= heights) + vert = np.logical_not(horz) + horz_inds = np.where(horz)[0] + vert_inds = np.where(vert)[0] + inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) + inds = np.reshape(inds, (-1, 2)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1, )) + self.index = inds + else: + np.random.shuffle(self.index)
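A toy walk-through of the aspect-grouping shuffle implemented in reset() above; the widths and heights are invented. Wide and tall images are permuted separately and then paired, so each batch of two shares an orientation and pads less after tensor_vstack.

```python
# Illustration only: aspect grouping on four toy images.
import numpy as np
widths = np.array([800, 600, 900, 500])
heights = np.array([600, 800, 500, 900])
horz_inds = np.where(widths >= heights)[0]   # wide images: 0 and 2
vert_inds = np.where(widths < heights)[0]    # tall images: 1 and 3
inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds)))
inds = np.reshape(inds, (-1, 2))             # one row per batch of two
row_perm = np.random.permutation(np.arange(inds.shape[0]))
inds = np.reshape(inds[row_perm, :], (-1, ))
print inds                                   # e.g. [2 0 3 1]
```

+ + def iter_next(self): + return self.cur + self.batch_size <= self.size + + def next(self): + if self.iter_next(): + self.get_batch() + self.cur += self.batch_size + return mx.io.DataBatch(data=self.data, label=self.label, + pad=self.getpad(), index=self.getindex(), + provide_data=self.provide_data, provide_label=self.provide_label) + else: + raise StopIteration + + def getindex(self): + return self.cur / self.batch_size + + def getpad(self): + if self.cur + self.batch_size > self.size: + return self.cur + self.batch_size - self.size + else: + return 0 + + def get_batch(self): + cur_from = self.cur + cur_to = min(cur_from + self.batch_size, self.size) + roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] + if self.mode == 'test': + self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode) + else: + work_load_list = self.work_load_list + ctx = self.ctx + if work_load_list is None: + work_load_list = [1] * len(ctx) + assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ + "Invalid settings for work load. 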
" + slices = _split_input_slice(self.batch_size, work_load_list) + + data_list = [] + label_list = [] + for islice in slices: + iroidb = [roidb[i] for i in range(islice.start, islice.stop)] + data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode) + data_list.append(data) + label_list.append(label) + + all_data = dict() + for key in data_list[0].keys(): + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + + all_label = dict() + for key in label_list[0].keys(): + all_label[key] = tensor_vstack([batch[key] for batch in label_list]) + + self.data = [mx.nd.array(all_data['data']), + mx.nd.array(all_data['rois'])] + self.label = [mx.nd.array(all_label['label']), + mx.nd.array(all_label['bbox_target']), + mx.nd.array(all_label['bbox_inside_weight']), + mx.nd.array(all_label['bbox_outside_weight'])] + + +class AnchorLoader(mx.io.DataIter): + def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, mode='train', ctx=None, work_load_list=None, + feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2), allowed_border=0): + """ + This Iter will provide roi data to Fast R-CNN network + :param feat_sym: to infer shape of assign_output + :param roidb: must be preprocessed + :param batch_size: must divide BATCH_SIZE(128) + :param shuffle: bool + :param mode: control returned info + :param ctx: list of contexts + :param work_load_list: list of work load + :return: AnchorLoader + """ + super(AnchorLoader, self).__init__() + + self.feat_sym = feat_sym + self.roidb = roidb + self.batch_size = batch_size + self.shuffle = shuffle + self.mode = mode + self.ctx = ctx + if self.ctx is None: + self.ctx = [mx.cpu()] + self.work_load_list = work_load_list + self.feat_stride = feat_stride + self.anchor_scales = anchor_scales + self.anchor_ratios = anchor_ratios + self.allowed_border = allowed_border + + self.cur = 0 + self.size = len(roidb) + self.index = np.arange(self.size) + self.num_classes = self.roidb[0]['gt_overlaps'].shape[1] + self.reset() + + self.batch = None + self.data = None + self.label = None + self.get_batch() + self.data_name = ['data', 'im_info'] + self.label_name = ['label', 'bbox_target', 'bbox_inside_weight', 'bbox_outside_weight'] + + @property + def provide_data(self): + if self.mode == 'train': + return [('data', self.data[0].shape)] + else: + return [(k, v.shape) for k, v in self.data.items()] + + @property + def provide_label(self): + if self.mode == 'train': + return [('label', self.label[0].shape), + ('bbox_target', self.label[1].shape), + ('bbox_inside_weight', self.label[2].shape), + ('bbox_outside_weight', self.label[3].shape)] + else: + return [(k, v.shape) for k, v in self.data.items()] + + def reset(self): + self.cur = 0 + if self.shuffle: + if config.TRAIN.ASPECT_GROUPING: + widths = np.array([r['width'] for r in self.roidb]) + heights = np.array([r['height'] for r in self.roidb]) + horz = (widths >= heights) + vert = np.logical_not(horz) + horz_inds = np.where(horz)[0] + vert_inds = np.where(vert)[0] + inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) + inds = np.reshape(inds, (-1, 2)) + row_perm = np.random.permutation(np.arange(inds.shape[0])) + inds = np.reshape(inds[row_perm, :], (-1, )) + self.index = inds + else: + np.random.shuffle(self.index) + + def iter_next(self): + return self.cur + self.batch_size <= self.size + + def next(self): + if self.iter_next(): + self.get_batch() + self.cur += self.batch_size + return mx.io.DataBatch(data=self.data, label=self.label, + 
pad=self.getpad(), index=self.getindex(), + provide_data=self.provide_data, provide_label=self.provide_label) + else: + raise StopIteration + + def getindex(self): + return self.cur / self.batch_size + + def getpad(self): + if self.cur + self.batch_size > self.size: + return self.cur + self.batch_size - self.size + else: + return 0 + + def get_batch(self): + cur_from = self.cur + cur_to = min(cur_from + self.batch_size, self.size) + roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] + if self.mode == 'test': + self.data, self.label = minibatch.get_minibatch(roidb, self.num_classes, self.mode) + else: + work_load_list = self.work_load_list + ctx = self.ctx + if work_load_list is None: + work_load_list = [1] * len(ctx) + assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ + "Invalid settings for work load. " + slices = _split_input_slice(self.batch_size, work_load_list) + + data_list = [] + label_list = [] + for islice in slices: + iroidb = [roidb[i] for i in range(islice.start, islice.stop)] + data, label = minibatch.get_minibatch(iroidb, self.num_classes, self.mode) + data_list.append(data) + label_list.append(label) + + # pad data first and then assign anchor (read label) + data_tensor = tensor_vstack([batch['data'] for batch in data_list]) + for data, data_pad in zip(data_list, data_tensor): + data['data'] = data_pad[np.newaxis, :] + + new_label_list = [] + for data, label in zip(data_list, label_list): + # infer label shape + data_shape = {k: v.shape for k, v in data.items()} + del data_shape['im_info'] + _, feat_shape, _ = self.feat_sym.infer_shape(**data_shape) + feat_shape = [int(i) for i in feat_shape[0]] + + # assign anchor for label + label = minibatch.assign_anchor(feat_shape, label['gt_boxes'], data['im_info'], + self.feat_stride, self.anchor_scales, + self.anchor_ratios, self.allowed_border) + del data['im_info'] + new_label_list.append(label) + + all_data = dict() + for key in ['data']: + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + + all_label = dict() + all_label['label'] = tensor_vstack([batch['label'] for batch in new_label_list], pad=-1) + for key in ['bbox_target', 'bbox_inside_weight', 'bbox_outside_weight']: + all_label[key] = tensor_vstack([batch[key] for batch in new_label_list]) + + self.data = [mx.nd.array(all_data['data'])] + + self.label = [mx.nd.array(all_label['label']), + mx.nd.array(all_label['bbox_target']), + mx.nd.array(all_label['bbox_inside_weight']), + mx.nd.array(all_label['bbox_outside_weight'])] diff --git a/example/rcnn/rcnn/metric.py b/example/rcnn/rcnn/metric.py index c31e5533c04b..b8bd90875604 100644 --- a/example/rcnn/rcnn/metric.py +++ b/example/rcnn/rcnn/metric.py @@ -4,14 +4,52 @@ from rcnn.config import config +class AccuracyMetric(mx.metric.EvalMetric): + def __init__(self, use_ignore=False, ignore=None): + super(AccuracyMetric, self).__init__('Accuracy') + self.use_ignore = use_ignore + self.ignore = ignore + self.has_rpn = config.TRAIN.HAS_RPN + if self.has_rpn: + assert self.use_ignore and self.ignore is not None + + def update(self, labels, preds): + if self.has_rpn: + pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32') + label = labels[0].asnumpy().astype('int32') + non_ignore_inds = np.where(label != self.ignore) + pred_label = pred_label[non_ignore_inds] + label = label[non_ignore_inds] + else: + last_dim = preds[0].shape[-1] + pred_label = preds[0].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') + label = 
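A sketch of the feat_sym.infer_shape step used in get_batch above, on an assumed toy network rather than the real VGG feature symbol: the feature-map height and width that assign_anchor needs fall out of shape inference.

```python
# Illustration only: inferring the feature-map shape for anchor assignment.
import mxnet as mx
data = mx.symbol.Variable('data')
conv = mx.symbol.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=8)
feat_sym = mx.symbol.Pooling(data=conv, pool_type='max', kernel=(2, 2), stride=(2, 2))
_, feat_shape, _ = feat_sym.infer_shape(data=(1, 3, 600, 1000))
print feat_shape[0]   # (1, 8, 300, 500) -> feat_height=300, feat_width=500
```

diff --git a/example/rcnn/rcnn/metric.py b/example/rcnn/rcnn/metric.py index c31e5533c04b..b8bd90875604 100644 --- a/example/rcnn/rcnn/metric.py +++ b/example/rcnn/rcnn/metric.py @@ -4,14 +4,52 @@ from rcnn.config import config +class AccuracyMetric(mx.metric.EvalMetric): + def __init__(self, use_ignore=False, ignore=None): + super(AccuracyMetric, self).__init__('Accuracy') + self.use_ignore = use_ignore + self.ignore = ignore + self.has_rpn = config.TRAIN.HAS_RPN + if self.has_rpn: + assert self.use_ignore and self.ignore is not None + + def update(self, labels, preds): + if self.has_rpn: + pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32') + label = labels[0].asnumpy().astype('int32') + non_ignore_inds = np.where(label != self.ignore) + pred_label = pred_label[non_ignore_inds] + label = label[non_ignore_inds] + else: + last_dim = preds[0].shape[-1] + pred_label = preds[0].asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') + label = 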
labels[0].asnumpy().reshape(-1,).astype('int32') + + self.sum_metric += (pred_label.flat == label.flat).sum() + self.num_inst += len(pred_label.flat) + + class LogLossMetric(mx.metric.EvalMetric): - def __init__(self): + def __init__(self, use_ignore=False, ignore=None): super(LogLossMetric, self).__init__('LogLoss') + self.use_ignore = use_ignore + self.ignore = ignore + self.has_rpn = config.TRAIN.HAS_RPN + if self.has_rpn: + assert self.use_ignore and self.ignore is not None def update(self, labels, preds): - pred_cls = preds[0].asnumpy() - label = labels[0].asnumpy().astype('int32') - cls = pred_cls[np.arange(label.shape[0]), label] + if self.has_rpn: + pred_cls = preds[0].asnumpy()[0] + label = labels[0].asnumpy().astype('int32')[0] + non_ignore_inds = np.where(label != self.ignore)[0] + label = label[non_ignore_inds] + cls = pred_cls[label, non_ignore_inds] + else: + last_dim = preds[0].shape[-1] + pred_cls = preds[0].asnumpy().reshape(-1, last_dim) + label = labels[0].asnumpy().reshape(-1,).astype('int32') + cls = pred_cls[np.arange(label.shape[0]), label] cls += config.EPS cls_loss = -1 * np.log(cls) cls_loss = np.sum(cls_loss) @@ -22,22 +60,15 @@ def update(self, labels, preds): class SmoothL1LossMetric(mx.metric.EvalMetric): def __init__(self): super(SmoothL1LossMetric, self).__init__('SmoothL1Loss') + self.has_rpn = config.TRAIN.HAS_RPN def update(self, labels, preds): bbox_loss = preds[1].asnumpy() - label = labels[1].asnumpy() + if self.has_rpn: + bbox_loss = bbox_loss.reshape((bbox_loss.shape[0], -1)) + else: + first_dim = bbox_loss.shape[0] * bbox_loss.shape[1] + bbox_loss = bbox_loss.reshape(first_dim, -1) + self.num_inst += bbox_loss.shape[0] bbox_loss = np.sum(bbox_loss) self.sum_metric += bbox_loss - self.num_inst += label.shape[0] - - -class Accuracy(mx.metric.EvalMetric): - def __init__(self): - super(Accuracy, self).__init__('accuracy') - - def update(self, labels, preds): - pred_label = mx.ndarray.argmax_channel(preds[0]).asnumpy().astype('int32') - label = labels[0].asnumpy().astype('int32') - - self.sum_metric += (pred_label.flat == label.flat).sum() - self.num_inst += len(pred_label.flat) diff --git a/example/rcnn/rcnn/minibatch.py b/example/rcnn/rcnn/minibatch.py index b160ff96700b..920d27eef22b 100644 --- a/example/rcnn/rcnn/minibatch.py +++ b/example/rcnn/rcnn/minibatch.py @@ -1,18 +1,24 @@ """ To construct data iterator from imdb, batch sampling procedures are defined here -training minibatch = +RPN: +data = +{'data': [num_images, c, h, w], - 'rois': [num_rois, 5], - 'labels': [num_rois], - 'bbox_targets': [num_rois, 4 * num_classes], - 'bbox_inside_weights': [num_rois, 4 * num_classes], - 'bbox_outside_weights': [num_rois, 4 * num_classes]} - num_images should divide config['TRAIN_BATCH_SIZE'] and num_rois = config['TRAIN_BATCH_SIZE'] / num_images -validation minibatch is similar except num_images = 1 and num_rois = all rois -testing minibatch = + 'im_info': [num_images, 3] (optional)} +label = +prototype: {'gt_boxes': [num_boxes, 5]} +final: {'label': [batch_size, 1] <- [batch_size, num_anchors, feat_height, feat_width], + 'bbox_target': [batch_size, num_anchors, feat_height, feat_width], + 'bbox_inside_weight': [batch_size, num_anchors, feat_height, feat_width], + 'bbox_outside_weight': [batch_size, num_anchors, feat_height, feat_width]} +Fast R-CNN: +data = +{'data': [num_images, c, h, w], - 'rois': [num_rois, 5]} - num_images = 1 and num_rois = all rois + 'rois': [num_images, num_rois, 5]} +label = + {'label': [num_images, num_rois], + 'bbox_target': 
[num_images, num_rois, 4 * num_classes], + 'bbox_inside_weight': [num_images, num_rois, 4 * num_classes], + 'bbox_outside_weight': [num_images, num_rois, 4 * num_classes]} """ import cv2 @@ -21,85 +27,101 @@ from helper.processing import image_processing from helper.processing.bbox_regression import expand_bbox_regression_targets +from helper.processing.generate_anchor import generate_anchors +from helper.processing.bbox_regression import bbox_overlaps +from helper.processing.bbox_transform import bbox_transform from rcnn.config import config -def get_minibatch(roidb, num_classes, ctx): + +def get_minibatch(roidb, num_classes, mode='test'): """ return minibatch of images in roidb - :param roidb: subset of main database - :param num_classes: number of classes is used in bbox regression targets - :return: minibatch: {'data', 'rois', 'labels', 'bbox_targets', 'bbox_inside_weights', 'bbox_outside_weights'} - """ - num_images = len(roidb) - random_scale_indexes = npr.randint(0, high=len(config.TRAIN.SCALES), size=num_images) - assert config.TRAIN.BATCH_SIZE % num_images == 0, \ - 'num_images {} must devide BATCHSIZE {}'.format(num_images, config.TRAIN.BATCH_SIZE) - rois_per_image = config.TRAIN.BATCH_SIZE / num_images - fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int) - - # im_array: [num_images, c, h, w] - im_array, im_scales = get_image_array(roidb, config.TRAIN.SCALES, random_scale_indexes) - rois_array = list() - labels_array = list() - bbox_targets_array = list() - bbox_inside_array = list() - - for im_i in range(num_images): - im_rois, labels, bbox_targets, bbox_inside_weights, overlaps = \ - sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes) - - # project im_rois - # do not round roi - rois = im_rois * im_scales[im_i] - batch_index = im_i * np.ones((rois.shape[0], 1)) - rois_array_this_image = np.hstack((batch_index, rois)) - rois_array.append(rois_array_this_image) - - # add labels - labels_array.append(labels) - bbox_targets_array.append(bbox_targets) - bbox_inside_array.append(bbox_inside_weights) - - rois_array = np.vstack(rois_array) - labels_array = np.hstack(labels_array) - bbox_targets_array = np.vstack(bbox_targets_array) - bbox_inside_array = np.vstack(bbox_inside_array) - bbox_outside_array = np.array(bbox_inside_array > 0).astype(np.float32) - - minibatch = {'data': im_array, - 'rois': rois_array, - 'labels': labels_array, - 'bbox_targets': bbox_targets_array, - 'bbox_inside_weights': bbox_inside_array, - 'bbox_outside_weights': bbox_outside_array} - return minibatch - - -def get_testbatch(roidb, num_classes): - """ - return test batch of given roidb - actually, there is only one testing scale and len(roidb) is 1 - :param roidb: subset of main database + :param roidb: a list of dict, whose length controls batch size :param num_classes: number of classes is used in bbox regression targets - :return: minibatch: {'data', 'rois'} + :param mode: controls whether blank labels are returned + :return: data, label """ + # build im_array: [num_images, c, h, w] num_images = len(roidb) - random_scale_indexes = npr.randint(0, high=len(config.TEST.SCALES), size=num_images) - im_array, im_scales = get_image_array(roidb, config.TEST.SCALES, random_scale_indexes) + random_scale_indexes = npr.randint(0, high=len(config.SCALES), size=num_images) + im_array, im_scales = get_image_array(roidb, config.SCALES, random_scale_indexes) + + if mode == 'train': + cfg_key = 'TRAIN' + else: + cfg_key = 'TEST' + + if config[cfg_key].HAS_RPN: + assert 
len(roidb) == 1, 'Single batch only' + assert len(im_scales) == 1, 'Single batch only' + im_info = np.array([[im_array.shape[2], im_array.shape[3], im_scales[0]]], dtype=np.float32) + + data = {'data': im_array, + 'im_info': im_info} + label = {} + + if mode == 'train': + # gt boxes: (x1, y1, x2, y2, cls) + gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] + gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) + gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] + gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] + label = {'gt_boxes': gt_boxes} + else: + if mode == 'train': + assert config.TRAIN.BATCH_SIZE % config.TRAIN.BATCH_IMAGES == 0, \ + 'BATCH_IMAGES {} must divide BATCH_SIZE {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_SIZE) + rois_per_image = config.TRAIN.BATCH_SIZE / config.TRAIN.BATCH_IMAGES + fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int) + + rois_array = list() + labels_array = list() + bbox_targets_array = list() + bbox_inside_array = list() + + for im_i in range(num_images): + im_rois, labels, bbox_targets, bbox_inside_weights, overlaps = \ + sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image, num_classes) - rois_array = list() - for im_i in range(num_images): - im_rois = roidb[im_i]['boxes'] - rois = im_rois * im_scales[im_i] - batch_index = im_i * np.ones((rois.shape[0], 1)) - rois_array_this_image = np.hstack((batch_index, rois)) - rois_array.append(rois_array_this_image) + # project im_rois + # do not round roi + rois = im_rois * im_scales[im_i] + batch_index = im_i * np.ones((rois.shape[0], 1)) + rois_array_this_image = np.hstack((batch_index, rois)) + rois_array.append(rois_array_this_image) - rois_array = np.vstack(rois_array) + # add labels + labels_array.append(labels) + bbox_targets_array.append(bbox_targets) + bbox_inside_array.append(bbox_inside_weights) - testbatch = {'data': im_array, - 'rois': rois_array} - return testbatch + rois_array = np.array(rois_array) + labels_array = np.array(labels_array) + bbox_targets_array = np.array(bbox_targets_array) + bbox_inside_array = np.array(bbox_inside_array) + bbox_outside_array = np.array(bbox_inside_array > 0).astype(np.float32) + + data = {'data': im_array, + 'rois': rois_array} + label = {'label': labels_array, + 'bbox_target': bbox_targets_array, + 'bbox_inside_weight': bbox_inside_array, + 'bbox_outside_weight': bbox_outside_array} + else: + rois_array = list() + for im_i in range(num_images): + im_rois = roidb[im_i]['boxes'] + rois = im_rois * im_scales[im_i] + batch_index = im_i * np.ones((rois.shape[0], 1)) + rois_array_this_image = np.hstack((batch_index, rois)) + rois_array.append(rois_array_this_image) + rois_array = np.vstack(rois_array) + + data = {'data': im_array, + 'rois': rois_array} + label = {} + + return data, label
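A hypothetical call illustrating the two data/label layouts produced above; `roidb` stands for a preprocessed roidb list, and the shapes follow the module docstring.

```python
# Illustration only: inspecting get_minibatch output in RPN and Fast R-CNN regimes.
from rcnn.config import config
data, label = get_minibatch(roidb[:1], num_classes=21, mode='train')
if config.TRAIN.HAS_RPN:
    print data['data'].shape, data['im_info']   # image tensor, (h, w, scale)
    print label['gt_boxes'].shape               # (num_gt, 5): x1, y1, x2, y2, cls
else:
    print data['rois'].shape                    # (num_images, rois_per_image, 5)
    print label['label'].shape                  # (num_images, rois_per_image)
```

def get_image_array(roidb, scales, scale_indexes): @@ -118,7 +140,7 @@ if roidb[i]['flipped']: im = im[:, ::-1, :] target_size = scales[scale_indexes[i]] - im, im_scale = image_processing.resize(im, target_size, config.TRAIN.MAX_SIZE) + im, im_scale = image_processing.resize(im, target_size, config.MAX_SIZE) im_tensor = image_processing.transform(im, config.PIXEL_MEANS) processed_ims.append(im_tensor) im_scales.append(im_scale) @@ -177,3 +199,193 @@ def sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes): expand_bbox_regression_targets(roidb['bbox_targets'][keep_indexes, :], num_classes) return rois, labels, bbox_targets, 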
bbox_inside_weights, overlaps + + +def assign_anchor(feat_shape, gt_boxes, im_info, feat_stride=16, + scales=(8, 16, 32), ratios=(0.5, 1, 2), allowed_border=0): + """ + assign ground truth boxes to anchor positions + :param feat_shape: infer output shape + :param gt_boxes: assign ground truth + :param im_info: filter out anchors overlapped with edges + :param feat_stride: anchor position step + :param scales: used to generate anchors, affects num_anchors (per location) + :param ratios: aspect ratios of generated anchors + :param allowed_border: filter out anchors with edge overlap > allowed_border + :return: dict of label + 'label': of shape (batch_size, 1) <- (batch_size, num_anchors, feat_height, feat_width) + 'bbox_target': of shape (batch_size, num_anchors * 4, feat_height, feat_width) + 'bbox_inside_weight': *todo* mark the assigned anchors + 'bbox_outside_weight': used to normalize the bbox_loss, all weights sum to RPN_POSITIVE_WEIGHT + """ + def _unmap(data, count, inds, fill=0): + """ unmap a subset inds of data into original data of size count """ + if len(data.shape) == 1: + ret = np.empty((count,), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count,) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + def _compute_targets(ex_rois, gt_rois): + """ compute bbox targets for an image """ + assert ex_rois.shape[0] == gt_rois.shape[0] + assert ex_rois.shape[1] == 4 + assert gt_rois.shape[1] == 5 + + return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) + + DEBUG = False + im_info = im_info[0] + scales = np.array(scales, dtype=np.float32) + base_anchors = generate_anchors(base_size=16, ratios=list(ratios), scales=scales) + num_anchors = base_anchors.shape[0] + feat_height, feat_width = feat_shape[-2:] + + if DEBUG: + print 'anchors:' + print base_anchors + print 'anchor shapes:' + print np.hstack((base_anchors[:, 2::4] - base_anchors[:, 0::4], + base_anchors[:, 3::4] - base_anchors[:, 1::4])) + print 'im_info', im_info + print 'height', feat_height, 'width', feat_width + print 'gt_boxes shape', gt_boxes.shape + print 'gt_boxes', gt_boxes + + # 1. 
generate shifted anchors for every position of the feature map + shift_x = np.arange(0, feat_width) * feat_stride + shift_y = np.arange(0, feat_height) * feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = num_anchors + K = shifts.shape[0] + all_anchors = base_anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) + all_anchors = all_anchors.reshape((K * A, 4)) + total_anchors = int(K * A)
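A minimal numpy check, on an invented 2x2 feature map with two toy anchors, of the broadcast that produces the (K*A, 4) shifted anchors above:

```python
# Illustration only: (1, A, 4) + (K, 1, 4) broadcasting to (K, A, 4).
import numpy as np
base_anchors = np.array([[-8, -8, 8, 8], [-16, -16, 16, 16]])        # (A, 4), A=2
shift_x, shift_y = np.meshgrid(np.arange(2) * 16, np.arange(2) * 16)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()   # (K, 4), K=4
all_anchors = base_anchors.reshape((1, 2, 4)) + shifts.reshape((1, 4, 4)).transpose((1, 0, 2))
print all_anchors.reshape((-1, 4)).shape                             # (K*A, 4) = (8, 4)
```

+ + # only keep anchors inside the image + inds_inside = np.where((all_anchors[:, 0] >= -allowed_border) & + (all_anchors[:, 1] >= -allowed_border) & + (all_anchors[:, 2] < im_info[1] + allowed_border) & + (all_anchors[:, 3] < im_info[0] + allowed_border))[0] + if DEBUG: + print 'total_anchors', total_anchors + print 'inds_inside', len(inds_inside) + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + if DEBUG: + print 'anchors shape', anchors.shape + + # label: 1 is positive, 0 is negative, -1 is don't care + labels = np.empty((len(inds_inside),), dtype=np.float32) + labels.fill(-1) + + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), gt_boxes.astype(np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IoU + labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + if config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + else: + labels[:] = 0 + + # subsample positive labels if we have too many + num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE) + fg_inds = np.where(labels == 1)[0] + if len(fg_inds) > num_fg: + disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False) + if DEBUG: + disable_inds = fg_inds[:(len(fg_inds) - num_fg)] + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + if DEBUG: + disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + labels[disable_inds] = -1 + + bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) + if gt_boxes.size > 0: + bbox_targets[:] = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) + + bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) + bbox_inside_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) + + bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) + if config.TRAIN.RPN_POSITIVE_WEIGHT < 0: + # uniform weighting of examples (given non-uniform sampling) 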
+ num_examples = np.sum(labels >= 0) + positive_weights = np.ones((1, 4)) * 1.0 / num_examples + negative_weights = np.ones((1, 4)) * 1.0 / num_examples + else: + assert ((config.TRAIN.RPN_POSITIVE_WEIGHT > 0) & (config.TRAIN.RPN_POSITIVE_WEIGHT < 1)) + positive_weights = config.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1) + negative_weights = (1.0 - config.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 1) + bbox_outside_weights[labels == 1, :] = positive_weights + bbox_outside_weights[labels == 0, :] = negative_weights + + if DEBUG: + _sums = bbox_targets[labels == 1, :].sum(axis=0) + _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) + _counts = config.EPS + np.sum(labels == 1) + means = _sums / _counts + stds = np.sqrt(_squared_sums / _counts - means ** 2) + print 'means', means + print 'stdevs', stds + + # map up to original set of anchors + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) + bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) + + if DEBUG: + print 'rpn: max max_overlaps', np.max(max_overlaps) + print 'rpn: num_positives', np.sum(labels == 1) + print 'rpn: num_negatives', np.sum(labels == 0) + _fg_sum = np.sum(labels == 1) + _bg_sum = np.sum(labels == 0) + _count = 1 + print 'rpn: num_positive avg', _fg_sum / _count + print 'rpn: num_negative avg', _bg_sum / _count + + labels = labels.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) + labels = labels.reshape((1, A * feat_height * feat_width)) + bbox_targets = bbox_targets.reshape((1, feat_height, feat_width, A * 4)).transpose(0, 3, 1, 2) + bbox_inside_weights = bbox_inside_weights.reshape((1, feat_height, feat_width, A * 4)).transpose((0, 3, 1, 2)) + bbox_outside_weights = bbox_outside_weights.reshape((1, feat_height, feat_width, A * 4)).transpose((0, 3, 1, 2)) + + label = {'label': labels, + 'bbox_target': bbox_targets, + 'bbox_inside_weight': bbox_inside_weights, + 'bbox_outside_weight': bbox_outside_weights} + return label
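A small sketch of the (1, feat_height, feat_width, A) to (1, A*feat_height*feat_width) label flattening performed at the end of assign_anchor; the sizes are toy values, not from the patch.

```python
# Illustration only: the label layout the RPN softmax consumes.
import numpy as np
feat_height, feat_width, A = 2, 3, 9
labels = np.random.choice([-1, 0, 1], size=(1, feat_height, feat_width, A))
labels = labels.transpose(0, 3, 1, 2)                  # (1, A, H, W)
labels = labels.reshape((1, A * feat_height * feat_width))
print labels.shape                                     # (1, 54)
```

diff --git a/example/rcnn/rcnn/module.py b/example/rcnn/rcnn/module.py new file mode 100644 index 000000000000..6b5aef1d3d51 --- /dev/null +++ b/example/rcnn/rcnn/module.py @@ -0,0 +1,195 @@ +"""A `MutableModule` implements the `BaseModule` API, and allows input shape +varying with training iterations. If shapes vary, executors will rebind, +using shared arrays from the initial module bound with maximum shape. +""" + +import logging + +from mxnet import context as ctx +from mxnet.initializer import Uniform +from mxnet.module.base_module import BaseModule +from mxnet.module.module import Module + +class MutableModule(BaseModule): + """A mutable module is a module that supports variable input data. 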
+ + Parameters + ---------- + symbol : Symbol + data_names : list of str + label_names : list of str + logger : Logger + context : Context or list of Context + work_load_list : list of number + max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary + max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary + fixed_param_prefix : list of str, indicating fixed parameters + """ + def __init__(self, symbol, data_names, label_names, + logger=logging, context=ctx.cpu(), work_load_list=None, + max_data_shapes=None, max_label_shapes=None, fixed_param_prefix=None): + super(MutableModule, self).__init__(logger=logger) + self._symbol = symbol + self._data_names = data_names + self._label_names = label_names + self._context = context + self._work_load_list = work_load_list + + self._curr_module = None + self._max_data_shapes = max_data_shapes + self._max_label_shapes = max_label_shapes + self._fixed_param_prefix = fixed_param_prefix + + if self._max_data_shapes is None: + self._max_data_shapes = [] + if self._max_label_shapes is None: + self._max_label_shapes = [] + if self._fixed_param_prefix is None: + self._fixed_param_prefix = [] + + fixed_param_names = list() + for name in self._symbol.list_arguments(): + for prefix in self._fixed_param_prefix: + if prefix in name: + fixed_param_names.append(name) + self._fixed_param_names = fixed_param_names + + def _reset_bind(self): + self.binded = False + self._curr_module = None + + @property + def data_names(self): + return self._data_names + + @property + def output_names(self): + return self._symbol.list_outputs() + + @property + def data_shapes(self): + assert self.binded + return self._curr_module.data_shapes + + @property + def label_shapes(self): + assert self.binded + return self._curr_module.label_shapes + + @property + def output_shapes(self): + assert self.binded + return self._curr_module.output_shapes + + def get_params(self): + assert self.binded and self.params_initialized + return self._curr_module.get_params() + + def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, + allow_missing=False, force_init=False): + if self.params_initialized and not force_init: + return + assert self.binded, 'call bind before initializing the parameters' + self._curr_module.init_params(initializer=initializer, arg_params=arg_params, + aux_params=aux_params, allow_missing=allow_missing, + force_init=force_init) + self.params_initialized = True + + def bind(self, data_shapes, label_shapes=None, for_training=True, + inputs_need_grad=False, force_rebind=False, shared_module=None): + # in case we already initialized params, keep it + if self.params_initialized: + arg_params, aux_params = self.get_params() + + # force rebinding is typically used when one want to switch from + # training to prediction phase. 
+ if force_rebind: + self._reset_bind() + + if self.binded: + self.logger.warning('Already binded, ignoring bind()') + return + + assert shared_module is None, 'shared_module for MutableModule is not supported' + + self.for_training = for_training + self.inputs_need_grad = inputs_need_grad + self.binded = True + + max_shapes_dict = dict(self._max_data_shapes + self._max_label_shapes) + max_data_shapes = list() + for name, shape in data_shapes: + if name in max_shapes_dict: + max_data_shapes.append((name, max_shapes_dict[name])) + else: + max_data_shapes.append((name, shape)) + max_label_shapes = list() + for name, shape in label_shapes: + if name in max_shapes_dict: + max_label_shapes.append((name, max_shapes_dict[name])) + else: + max_label_shapes.append((name, shape)) + + module = Module(self._symbol, self._data_names, self._label_names, logger=self.logger, + context=self._context, work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad, + force_rebind=False, shared_module=None) + self._curr_module = module + + # copy back saved params, if already initialized + if self.params_initialized: + self.set_params(arg_params, aux_params) + + def init_optimizer(self, kvstore='local', optimizer='sgd', + optimizer_params=(('learning_rate', 0.01),), force_init=False): + assert self.binded and self.params_initialized + if self.optimizer_initialized and not force_init: + self.logger.warning('optimizer already initialized, ignoring.') + return + + self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params, + force_init=force_init) + self.optimizer_initialized = True + + def forward(self, data_batch, is_train=None): + assert self.binded and self.params_initialized + + shape_changed = False + current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes) + input_shapes = dict(data_batch.provide_data + data_batch.provide_label) + for k, v in current_shapes.items(): + if v != input_shapes[k]: + shape_changed = True + + if shape_changed: + module = Module(self._symbol, self._data_names, self._label_names, + logger=self.logger, context=self._context, + work_load_list=self._work_load_list, + fixed_param_names=self._fixed_param_names) + module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training, + self._curr_module.inputs_need_grad, force_rebind=False, + shared_module=self._curr_module) + self._curr_module = module + + self._curr_module.forward(data_batch, is_train=is_train) + + def backward(self, out_grads=None): + assert self.binded and self.params_initialized + self._curr_module.backward(out_grads=out_grads) + + def update(self): + assert self.binded and self.params_initialized and self.optimizer_initialized + self._curr_module.update() + + def get_outputs(self, merge_multi_context=True): + assert self.binded and self.params_initialized + return self._curr_module.get_outputs(merge_multi_context=merge_multi_context) + + def get_input_grads(self, merge_multi_context=True): + assert self.binded and self.params_initialized and self.inputs_need_grad + return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context) + + def update_metric(self, eval_metric, labels): + assert self.binded and self.params_initialized + self._curr_module.update_metric(eval_metric, labels) diff --git a/example/rcnn/rcnn/rpn/__init__.py b/example/rcnn/rcnn/rpn/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git 
a/example/rcnn/rcnn/rpn/generate.py b/example/rcnn/rcnn/rpn/generate.py new file mode 100644 index 000000000000..f1c8ddbef2e3 --- /dev/null +++ b/example/rcnn/rcnn/rpn/generate.py @@ -0,0 +1,116 @@ +import mxnet as mx +import numpy as np +import os +import cPickle + + +class Detector(object): + def __init__(self, symbol, ctx=None, + arg_params=None, aux_params=None): + self.symbol = symbol + self.ctx = ctx + if self.ctx is None: + self.ctx = mx.cpu() + self.executor = None + self.arg_params = arg_params + self.aux_params = aux_params + + def im_detect(self, im, im_info): + """ + perform detection of im, im_info + :param im: numpy.ndarray [b, c, h, w] + :param im_info: numpy.ndarray [b, 3] + :return: boxes [b, 5], scores [b,] + """ + self.arg_params['data'] = mx.nd.array(im, self.ctx) + self.arg_params['im_info'] = mx.nd.array(im_info, self.ctx) + arg_shapes, out_shapes, aux_shapes = \ + self.symbol.infer_shape(data=self.arg_params['data'].shape, im_info=self.arg_params['im_info'].shape) + aux_names = self.symbol.list_auxiliary_states() + self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)} + self.executor = self.symbol.bind(self.ctx, self.arg_params, args_grad=None, + grad_req='null', aux_states=self.aux_params) + output_dict = {name: nd for name, nd in zip(self.symbol.list_outputs(), self.executor.outputs)} + + self.executor.forward(is_train=False) + boxes = output_dict['rois_output'].asnumpy() + scores = output_dict['rois_score'].asnumpy() + + return boxes, scores + + +def generate_detections(detector, test_data, imdb, vis=False): + """ + Generate detection results using RPN. + :param detector: Detector + :param test_data: data iterator, must be non-shuffled + :param imdb: image database + :param vis: controls visualization + :return: list of detected boxes + """ + assert not test_data.shuffle + + i = 0 + imdb_boxes = list() + for databatch in test_data: + if i % 10 == 0: + print 'generating detections {}/{}'.format(i, imdb.num_images) + + boxes, scores = detector.im_detect(databatch.data['data'], databatch.data['im_info']) + scale = databatch.data['im_info'][0, 2] + # drop the batch index + boxes = boxes[:, 1:].copy() / scale + imdb_boxes.append(boxes) + if vis: + dets = np.hstack((boxes * scale, scores)) + vis_detection(databatch.data['data'], dets, thresh=0.9) + i += 1 + + assert len(imdb_boxes) == imdb.num_images, 'calculations not complete' + rpn_folder = os.path.join(imdb.root_path, 'rpn_data') + if not os.path.exists(rpn_folder): + os.mkdir(rpn_folder) + rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl') + with open(rpn_file, 'wb') as f: + cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL) + print 'wrote rpn proposals to {}'.format(rpn_file) + return imdb_boxes + +
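A sketch, following the rpn_file path convention used above, of reading the dumped proposals back for a later Fast R-CNN training stage (illustrative, not part of the patch):

```python
# Illustration only: load the per-image proposal list written by
# generate_detections; rpn_file is the same path it printed.
import cPickle
with open(rpn_file, 'rb') as f:
    imdb_boxes = cPickle.load(f)   # one (num_proposals, 4) array per image
print len(imdb_boxes)
```

+def vis_detection(im, dets, thresh=0.): + """ + draw detected bounding boxes + :param im: [b, c, h, w] in rgb + :param dets: only one class, [N * [4 coordinates score]] + :param thresh: thresh for valid detections + :return: + """ + from rcnn.config import config + from helper.processing.image_processing import transform_inverse + import matplotlib.pyplot as plt + inds = np.where(dets[:, -1] >= thresh)[0] + if len(inds) == 0: + return + inds = np.argsort(dets[:, -1])[::-1] + inds = inds[:20] + + class_name = 'obj' + fig, ax = plt.subplots(figsize=(12, 12)) + im = transform_inverse(im, config.PIXEL_MEANS) + ax.imshow(im, aspect='equal') + for i in inds: + bbox = dets[i, :4] + score = dets[i, -1] + rect = plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - 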
bbox[1], fill=False, + edgecolor='red', linewidth=3.5) + ax.add_patch(rect) + ax.text(bbox[0], bbox[1] - 2, + '{:s} {:.3f}'.format(class_name, score), + bbox=dict(facecolor='blue', alpha=0.5), fontsize=14, color='white') + ax.set_title('{} detections with p({} | box) >= {:.1f}'.format(class_name, class_name, thresh), fontsize=14) + plt.axis('off') + plt.tight_layout() + plt.draw() + plt.show() diff --git a/example/rcnn/rcnn/rpn/proposal.py b/example/rcnn/rcnn/rpn/proposal.py new file mode 100644 index 000000000000..b0303c5cfd84 --- /dev/null +++ b/example/rcnn/rcnn/rpn/proposal.py @@ -0,0 +1,206 @@ +""" +Proposal Operator transforms anchor coordinates into ROI coordinates, using the predicted +classification probabilities and bounding box regression outputs together with the image +size and scale information. +""" + +import mxnet as mx +import numpy as np +import numpy.random as npr + +from rcnn.config import config +from helper.processing.generate_anchor import generate_anchors +from helper.processing.bbox_transform import bbox_pred, clip_boxes +from helper.processing.nms import nms + +DEBUG = False + + +class ProposalOperator(mx.operator.CustomOp): + def __init__(self, feat_stride, scales, ratios, is_train=False, output_score=False): + super(ProposalOperator, self).__init__() + self._feat_stride = float(feat_stride) + self._scales = np.fromstring(scales[1:-1], dtype=float, sep=',') + self._ratios = np.fromstring(ratios[1:-1], dtype=float, sep=',').tolist() + self._anchors = generate_anchors(base_size=self._feat_stride, scales=self._scales, ratios=self._ratios) + self._num_anchors = self._anchors.shape[0] + self._output_score = output_score + + if DEBUG: + print 'feat_stride: {}'.format(self._feat_stride) + print 'anchors:' + print self._anchors + + if is_train: + self.cfg_key = 'TRAIN' + else: + self.cfg_key = 'TEST' + + def forward(self, is_train, req, in_data, out_data, aux): + # for each (H, W) location i + # generate A anchor boxes centered on cell i + # apply predicted bbox deltas at cell i to each of the A anchors + # clip predicted boxes to image + # remove predicted boxes with either height or width < threshold + # sort all (proposal, score) pairs by score from highest to lowest + # take top pre_nms_topN proposals before NMS + # apply NMS with threshold 0.7 to remaining proposals + # take after_nms_topN proposals after NMS + # return the top proposals (-> RoIs top, scores top) + + pre_nms_topN = config[self.cfg_key].RPN_PRE_NMS_TOP_N + post_nms_topN = config[self.cfg_key].RPN_POST_NMS_TOP_N + nms_thresh = config[self.cfg_key].RPN_NMS_THRESH + min_size = config[self.cfg_key].RPN_MIN_SIZE + + # the first _num_anchors channels are background probabilities + # keep the second part (foreground scores) + scores = in_data[0].asnumpy()[:, self._num_anchors:, :, :] + bbox_deltas = in_data[1].asnumpy() + im_info = in_data[2].asnumpy()[0, :] + + if DEBUG: + print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) + print 'scale: {}'.format(im_info[2]) + + # 1. 
Generate proposals from bbox_deltas and shifted anchors + height, width = scores.shape[-2:] + + if DEBUG: + print 'score map size: {}'.format(scores.shape) + + # Enumerate all shifts + shift_x = np.arange(0, width) * self._feat_stride + shift_y = np.arange(0, height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() + + # Enumerate all shifted anchors: + # + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + anchors = self._anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) + anchors = anchors.reshape((K * A, 4)) + + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # + # bbox deltas will be (1, 4 * A, H, W) format + # transpose to (1, H, W, 4 * A) + # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) + # in slowest to fastest order + bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) + + # Same story for the scores: + # + # scores are (1, A, H, W) format + # transpose to (1, H, W, A) + # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) + scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) + + # Convert anchors into proposals via bbox transformations + proposals = bbox_pred(anchors, bbox_deltas) + + # 2. clip predicted boxes to image + proposals = clip_boxes(proposals, im_info[:2]) + + # 3. remove predicted boxes with either height or width < threshold + # (NOTE: convert min_size to input image scale stored in im_info[2]) + keep = ProposalOperator._filter_boxes(proposals, min_size * im_info[2]) + proposals = proposals[keep, :] + scores = scores[keep] + + # 4. sort all (proposal, score) pairs by score from highest to lowest + # 5. take top pre_nms_topN (e.g. 6000) + order = scores.ravel().argsort()[::-1] + if pre_nms_topN > 0: + order = order[:pre_nms_topN] + proposals = proposals[order, :] + scores = scores[order] + + # 6. apply nms (e.g. threshold = 0.7) + # 7. take after_nms_topN (e.g. 300) + # 8. 
return the top proposals (-> RoIs top) + keep = nms(np.hstack((proposals, scores)), nms_thresh) + if post_nms_topN > 0: + keep = keep[:post_nms_topN] + # pad to ensure output size remains unchanged + if len(keep) < post_nms_topN: + pad = npr.choice(keep, size=post_nms_topN - len(keep)) + keep = np.hstack((keep, pad)) + proposals = proposals[keep, :] + scores = scores[keep] + + # Output rois array + # Our RPN implementation only supports a single input image, so all + # batch inds are 0 + batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) + blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) + self.assign(out_data[0], req[0], blob) + + if self._output_score: + self.assign(out_data[1], req[1], scores.astype(np.float32, copy=False)) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + pass + + @staticmethod + def _filter_boxes(boxes, min_size): + """ Remove all boxes with any side smaller than min_size """ + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + keep = np.where((ws >= min_size) & (hs >= min_size))[0] + return keep + + +@mx.operator.register("proposal") +class ProposalProp(mx.operator.CustomOpProp): + def __init__(self, feat_stride, scales, ratios, is_train=False, output_score=False): + super(ProposalProp, self).__init__(need_top_grad=False) + self._feat_stride = feat_stride + self._scales = scales + self._ratios = ratios + self._is_train = is_train + self._output_score = output_score + + if self._is_train: + self.cfg_key = 'TRAIN' + else: + self.cfg_key = 'TEST' + + def list_arguments(self): + return ['cls_prob', 'bbox_pred', 'im_info'] + + def list_outputs(self): + if self._output_score: + return ['output', 'score'] + else: + return ['output'] + + def infer_shape(self, in_shape): + cfg_key = self.cfg_key + cls_prob_shape = in_shape[0] + bbox_pred_shape = in_shape[1] + assert cls_prob_shape[0] == bbox_pred_shape[0], 'batch size of cls_prob and bbox_pred must match' + + batch_size = cls_prob_shape[0] + if batch_size > 1: + raise ValueError("Only single item batches are supported") + + im_info_shape = (batch_size, 3) + output_shape = (config[cfg_key].RPN_POST_NMS_TOP_N, 5) + score_shape = (config[cfg_key].RPN_POST_NMS_TOP_N, 1) + + if self._output_score: + return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape, score_shape] + else: + return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape] + + def create_operator(self, ctx, shapes, dtypes): + return ProposalOperator(self._feat_stride, self._scales, self._ratios, self._is_train, self._output_score) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return []
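A sketch of wiring the op registered above into a symbol. The cls_prob/bbox_pred/im_info variables are placeholders; parameters reach CustomOpProp as strings, which is why ProposalOperator parses scales and ratios with np.fromstring.

```python
# Illustration only: instantiating the custom 'proposal' op in a network graph.
import mxnet as mx
cls_prob = mx.symbol.Variable('cls_prob')
bbox_pred = mx.symbol.Variable('bbox_pred')
im_info = mx.symbol.Variable('im_info')
rois = mx.symbol.Custom(cls_prob=cls_prob, bbox_pred=bbox_pred, im_info=im_info,
                        name='rois', op_type='proposal',
                        feat_stride='16', scales='(8, 16, 32)', ratios='(0.5, 1, 2)')
```

diff --git a/example/rcnn/rcnn/solver.py b/example/rcnn/rcnn/solver.py deleted file mode 100644 index f59e9422b1c6..000000000000 --- a/example/rcnn/rcnn/solver.py +++ /dev/null @@ -1,87 +0,0 @@ -import mxnet as mx -import logging -import metric - -from collections import namedtuple -from callback import Speedometer -from config import config - -class Solver(object): - def __init__(self, prefix, - symbol, ctx=None, - begin_epoch=0, num_epoch=None, - kv_store='local', - arg_params=None, aux_params=None, - optimizer='sgd', - max_data_shape=None, **kwargs): - self.prefix = prefix - self.symbol = symbol - self.ctx = ctx - if self.ctx is None: - self.ctx = mx.cpu() - self.begin_epoch = begin_epoch - self.num_epoch = num_epoch - self.kv_store = kv_store - self.arg_params = arg_params - self.aux_params = aux_params - self.optimizer = optimizer - self.updater = 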
None - self.max_data_shape = max_data_shape - self.kwargs = kwargs.copy() - - self.arg_names = None - self.param_names = None - self.aux_names = None - - def get_params(self, grad_req): - arg_names = self.symbol.list_arguments() - self.arg_names = arg_names - arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=(1, 3, 224, 224), rois=(1, 5)) - if grad_req != 'null': - param_names = [] - for name, shape in zip(arg_names, arg_shapes): - if not (name.endswith('data') or name.endswith('rois') or - name.endswith('inside_weight') or name.endswith('outside_weight') or - name.endswith('label') or name.endswith('target') or - name.startswith('conv1') or name.startswith('conv2')): - param_names.append(name) - self.param_names = list(param_names) - aux_names = self.symbol.list_auxiliary_states() - self.aux_names = aux_names - self.aux_params = {k: mx.nd.zeros(s, self.ctx) for k, s in zip(aux_names, aux_shapes)} - - def fit(self, train_data, - grad_req='write', - frequent=20, - logger=None): - (kvstore, update_on_kvstore) = mx.model._create_kvstore( - self.kv_store, len(self.ctx), self.arg_params) - if logger is None: - logger = logging - logger.info('Start training with %s', str(self.ctx)) - - batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent) - epoch_end_callback = mx.callback.do_checkpoint(self.prefix) - - self.get_params(grad_req) - - eval_metric = metric.Accuracy() - cls_metric = metric.LogLossMetric() - bbox_metric = metric.SmoothL1LossMetric() - eval_metrics = mx.metric.CompositeEvalMetric() - for child_metric in [eval_metric, cls_metric, bbox_metric]: - eval_metrics.add(child_metric) - max_data_shape = self.max_data_shape - - self.optimizer = mx.optimizer.create(self.optimizer, rescale_grad=(1.0 / config.TRAIN.BATCH_SIZE), **self.kwargs) - mx.model._train_multi_device(self.symbol, self.ctx, self.arg_names, self.param_names, - self.aux_names, self.arg_params, self.aux_params, - begin_epoch=self.begin_epoch, end_epoch=self.num_epoch, - epoch_size=None, optimizer=self.optimizer, - train_data=train_data, eval_data=None, - eval_metric=eval_metrics, - epoch_end_callback=epoch_end_callback, - batch_end_callback=batch_end_callback, - kvstore=kvstore, update_on_kvstore=update_on_kvstore, - logger=logger, work_load_list=None, monitor=None, - mutable_data_shape=True, max_data_shape=self.max_data_shape) diff --git a/example/rcnn/rcnn/symbol.py b/example/rcnn/rcnn/symbol.py index dcbbf53ece74..e483fdc4f0b2 100644 --- a/example/rcnn/rcnn/symbol.py +++ b/example/rcnn/rcnn/symbol.py @@ -1,18 +1,14 @@ import mxnet as mx +import rpn.proposal +from config import config -def get_symbol_vgg(num_classes=21): +def get_vgg_conv(data): """ - Fast R-CNN with VGG 16 conv layers - :param num_classes: used to determine output size + shared convolutional layers + :param data: Symbol :return: Symbol """ - data = mx.symbol.Variable(name="data") - rois = mx.symbol.Variable(name='rois') - cls_prob_label = mx.symbol.Variable(name='cls_prob_label') - bbox_loss_target = mx.symbol.Variable(name='bbox_loss_target') - bbox_loss_inside_weight = mx.symbol.Variable(name='bbox_loss_inside_weight') - bbox_loss_outside_weight = mx.symbol.Variable(name='bbox_loss_outside_weight') # group 1 conv1_1 = mx.symbol.Convolution( data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") @@ -65,6 +61,34 @@ def get_symbol_vgg(num_classes=21): conv5_3 = mx.symbol.Convolution( data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") relu5_3 = mx.symbol.Activation(data=conv5_3, 
act_type="relu", name="relu5_3") + + return relu5_3 + + +def get_vgg_rcnn(num_classes=21): + """ + Fast R-CNN with VGG 16 conv layers + :param num_classes: used to determine output size + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + rois = mx.symbol.Variable(name='rois') + label = mx.symbol.Variable(name='label') + bbox_target = mx.symbol.Variable(name='bbox_target') + bbox_inside_weight = mx.symbol.Variable(name='bbox_inside_weight') + bbox_outside_weight = mx.symbol.Variable(name='bbox_outside_weight') + + # reshape input + rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape') + label = mx.symbol.Reshape(data=label, shape=(-1, ), name='label_reshape') + bbox_target = mx.symbol.Reshape(data=bbox_target, shape=(-1, 4 * num_classes), name='bbox_target_reshape') + bbox_inside_weight = mx.symbol.Reshape(data=bbox_inside_weight, shape=(-1, 4 * num_classes), name='bbox_inside_weight_reshape') + bbox_outside_weight = mx.symbol.Reshape(data=bbox_outside_weight, shape=(-1, 4 * num_classes), name='bbox_outside_weight_reshape') + + # shared convolutional layers + relu5_3 = get_vgg_conv(data) + + # Fast R-CNN pool5 = mx.symbol.ROIPooling( name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625) # group 6 @@ -78,78 +102,39 @@ def get_symbol_vgg(num_classes=21): drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") # classification cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) - cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=cls_prob_label) + cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label) # bounding box regression bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) - bbox_loss_ = bbox_loss_outside_weight * \ + bbox_loss_ = bbox_outside_weight * \ mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, - data=bbox_loss_inside_weight * (bbox_pred - bbox_loss_target)) + data=bbox_inside_weight * (bbox_pred - bbox_target)) bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_) + + # reshape output + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') + bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_loss_reshape') + # group output group = mx.symbol.Group([cls_prob, bbox_loss]) return group -def get_symbol_vgg_test(num_classes=21): +def get_vgg_rcnn_test(num_classes=21): """ - Fast R-CNN test with VGG 16 conv layers + Fast R-CNN Network with VGG :param num_classes: used to determine output size :return: Symbol """ data = mx.symbol.Variable(name="data") rois = mx.symbol.Variable(name='rois') - # group 1 - conv1_1 = mx.symbol.Convolution( - data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") - relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") - conv1_2 = mx.symbol.Convolution( - data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") - relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") - pool1 = mx.symbol.Pooling( - data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") - # group 2 - conv2_1 = mx.symbol.Convolution( - data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") - relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") - conv2_2 = mx.symbol.Convolution( - data=relu2_1, kernel=(3, 3), pad=(1, 1), 
num_filter=128, name="conv2_2") - relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") - pool2 = mx.symbol.Pooling( - data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") - # group 3 - conv3_1 = mx.symbol.Convolution( - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") - relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") - conv3_2 = mx.symbol.Convolution( - data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") - relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") - conv3_3 = mx.symbol.Convolution( - data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") - relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") - pool3 = mx.symbol.Pooling( - data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool3") - # group 4 - conv4_1 = mx.symbol.Convolution( - data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") - relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") - conv4_2 = mx.symbol.Convolution( - data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") - relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") - conv4_3 = mx.symbol.Convolution( - data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") - relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") - pool4 = mx.symbol.Pooling( - data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") - # group 5 - conv5_1 = mx.symbol.Convolution( - data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") - relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") - conv5_2 = mx.symbol.Convolution( - data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") - relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") - conv5_3 = mx.symbol.Convolution( - data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") - relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") + + # reshape rois + rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape') + + # shared convolutional layer + relu5_3 = get_vgg_conv(data) + + # Fast R-CNN pool5 = mx.symbol.ROIPooling( name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625) # group 6 @@ -166,6 +151,151 @@ def get_symbol_vgg_test(num_classes=21): cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score) # bounding box regression bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) + + # reshape output + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') + bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape') + # group output group = mx.symbol.Group([cls_prob, bbox_pred]) return group + + +def get_vgg_rpn(num_classes=21, num_anchors=9): + """ + Region Proposal Network with VGG + :param num_classes: used to determine output size + :param num_anchors: used to determine output size + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + label = mx.symbol.Variable(name='label') + bbox_target = mx.symbol.Variable(name='bbox_target') + bbox_inside_weight = mx.symbol.Variable(name='bbox_inside_weight') + bbox_outside_weight = 
mx.symbol.Variable(name='bbox_outside_weight') + + # shared convolutional layers + relu5_3 = get_vgg_conv(data) + + # RPN + rpn_conv = mx.symbol.Convolution( + data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1), name="rpn_cls_score_reshape") + + # classification + cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=label, multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, name="cls_prob") + # bounding box regression + bbox_loss_ = bbox_outside_weight * \ + mx.symbol.smooth_l1(name='bbox_loss_', scalar=3.0, + data=bbox_inside_weight * (rpn_bbox_pred - bbox_target)) + bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_) + # group output + group = mx.symbol.Group([cls_prob, bbox_loss]) + return group + + +def get_vgg_rpn_test(num_classes=21, num_anchors=9): + """ + Region Proposal Network with VGG + :param num_classes: used to determine output size + :param num_anchors: used to determine output size + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + im_info = mx.symbol.Variable(name="im_info") + + # shared convolutional layers + relu5_3 = get_vgg_conv(data) + + # RPN + rpn_conv = mx.symbol.Convolution( + data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + + # ROI Proposal + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + rpn_cls_prob = mx.symbol.SoftmaxActivation( + data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") + rpn_cls_prob_reshape = mx.symbol.Reshape( + data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') + group = mx.symbol.Custom( + cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + op_type='proposal', feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2), output_score=True) + # rois = group[0] + # score = group[1] + + return group + + +def get_vgg_test(num_classes=21, num_anchors=9): + """ + Faster R-CNN test with VGG 16 conv layers + :param num_classes: used to determine output size + :param num_anchors: used to determine output size + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + im_info = mx.symbol.Variable(name="im_info") + + # shared convolutional layers + relu5_3 = get_vgg_conv(data) + + # RPN + rpn_conv = mx.symbol.Convolution( + data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 
1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + + # ROI Proposal + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + rpn_cls_prob = mx.symbol.SoftmaxActivation( + data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") + rpn_cls_prob_reshape = mx.symbol.Reshape( + data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') + rois = mx.symbol.Custom( + cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + op_type='proposal', feat_stride=16, scales=(8, 16, 32), ratios=(0.5, 1, 2)) + + # Fast R-CNN + pool5 = mx.symbol.ROIPooling( + name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=0.0625) + # group 6 + flatten = mx.symbol.Flatten(data=pool5, name="flatten") + fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") + relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") + drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") + # group 7 + fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") + relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") + drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") + # classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) + cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score) + # bounding box regression + bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) + + # reshape output + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') + bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape') + + # group output + group = mx.symbol.Group([rois, cls_prob, bbox_pred]) + return group diff --git a/example/rcnn/rcnn/tester.py b/example/rcnn/rcnn/tester.py index 3f69daa6f822..0dc253e3878b 100644 --- a/example/rcnn/rcnn/tester.py +++ b/example/rcnn/rcnn/tester.py @@ -19,7 +19,7 @@ def pred_eval(detector, test_data, imdb, vis=False): """ assert not test_data.shuffle - thresh = 0.1 + thresh = 0.05 # limit detections to max_per_image over all classes max_per_image = 100 @@ -35,15 +35,17 @@ def pred_eval(detector, test_data, imdb, vis=False): if i % 10 == 0: print 'testing {}/{}'.format(i, imdb.num_images) - scores, boxes = detector.im_detect(databatch.data['data'], databatch.data['rois']) - - # we used scaled image & roi to train, so it is necessary to transform them back - # visualization should also be from the original size - im_path = imdb.image_path_from_index(imdb.image_set_index[i]) - im = cv2.imread(im_path) - im_height = im.shape[0] - scale = float(databatch.data['data'].shape[2]) / float(im_height) - im = image_processing.transform(im, config.PIXEL_MEANS) + if config.TEST.HAS_RPN: + scores, boxes = detector.im_detect(databatch.data['data'], im_info=databatch.data['im_info']) + scale = databatch.data['im_info'][0, 2] + else: + scores, boxes = detector.im_detect(databatch.data['data'], roi_array=databatch.data['rois']) + # we used scaled image & roi to train, so it is necessary to transform them back + # visualization should also be from the original size + im_path = imdb.image_path_from_index(imdb.image_set_index[i]) + im = cv2.imread(im_path) + im_height = im.shape[0] + scale = float(databatch.data['data'].shape[2]) / float(im_height) for j in range(1, 
imdb.num_classes): indexes = np.where(scores[:, j] > thresh)[0] @@ -64,7 +66,11 @@ def pred_eval(detector, test_data, imdb, vis=False): boxes_this_image = [[]] + [all_boxes[j][i] for j in range(1, imdb.num_classes)] if vis: - vis_all_detection(im, boxes_this_image, + # visualize the testing scale + for box in boxes_this_image: + if isinstance(box, np.ndarray): + box[:, :4] *= scale + vis_all_detection(databatch.data['data'], boxes_this_image, imdb_classes=imdb.classes) i += 1 @@ -78,7 +84,7 @@ def pred_eval(detector, test_data, imdb, vis=False): imdb.evaluate_detections(all_boxes) -def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.): +def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.7): """ visualize all detections in one image :param im_array: [b=1 c h w] in rgb @@ -101,8 +107,9 @@ def vis_all_detection(im_array, detections, imdb_classes=None, thresh=0.): rect = plt.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1], fill=False, - edgecolor=color, linewidth=2) + edgecolor=color, linewidth=3.5) plt.gca().add_patch(rect) - plt.gca().annotate('{} {:.3f}'.format(imdb_classes[j], score), - rect.get_xy(), color='w') + plt.gca().text(bbox[0], bbox[1] - 2, + '{:s} {:.3f}'.format(imdb_classes[j], score), + bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white') plt.show() diff --git a/example/rcnn/test.py b/example/rcnn/test.py index be183c9ef7d1..74ffc40673c2 100644 --- a/example/rcnn/test.py +++ b/example/rcnn/test.py @@ -1,29 +1,12 @@ import argparse -import mxnet as mx import os -from tools.test_net import test_net +import mxnet as mx -def parse_args(): - parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') - parser.add_argument('--image_set', dest='image_set', help='can be test', - default='test', type=str) - parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', - default='2007', type=str) - parser.add_argument('--root_path', dest='root_path', help='output data folder', - default=os.path.join(os.getcwd(), 'data'), type=str) - parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', - default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) - parser.add_argument('--prefix', dest='prefix', help='new model prefix', - default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str) - parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', - default=9, type=int) - parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with', - default=0, type=int) - args = parser.parse_args() - return args +from tools.test_rcnn import test_rcnn +from tools.test_rcnn import parse_args if __name__ == '__main__': args = parse_args() ctx = mx.gpu(args.gpu_id) - test_net(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx) + test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, args.has_rpn) diff --git a/example/rcnn/tools/demo_net.py b/example/rcnn/tools/demo_net.py deleted file mode 100644 index 4e9cdf7cd3eb..000000000000 --- a/example/rcnn/tools/demo_net.py +++ /dev/null @@ -1,50 +0,0 @@ -import numpy as np -import cv2 -import scipy.io as sio -from helper.processing.image_processing import resize, transform -from rcnn.config import config -from helper.processing.nms import nms -from rcnn.tester import vis_all_detection - -CLASSES = ('__background__', - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 
'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') - - -def demo_net(detector, image_name): - """ - wrapper for detector - :param detector: Detector - :param image_name: image name - :return: None - """ - # load demo data - im = cv2.imread(image_name + '.jpg') - im_array, im_scale = resize(im, config.TEST.SCALES[0], config.TRAIN.MAX_SIZE) - im_array = transform(im_array, config.PIXEL_MEANS) - roi_array = sio.loadmat(image_name + '_boxes.mat')['boxes'] - batch_index_array = np.zeros((roi_array.shape[0], 1)) - projected_rois = roi_array * im_scale - roi_array = np.hstack((batch_index_array, projected_rois)) - - scores, boxes = detector.im_detect(im_array, roi_array) - - all_boxes = [[] for _ in CLASSES] - CONF_THRESH = 0.8 - NMS_THRESH = 0.3 - for cls in CLASSES: - cls_ind = CLASSES.index(cls) - cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)] - cls_scores = scores[:, cls_ind] - keep = np.where(cls_scores >= CONF_THRESH)[0] - cls_boxes = cls_boxes[keep, :] - cls_scores = cls_scores[keep] - dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])).astype(np.float32) - keep = nms(dets, NMS_THRESH) - all_boxes[cls_ind] = dets[keep, :] - - boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] - vis_all_detection(im_array, boxes_this_image, CLASSES, 0) diff --git a/example/rcnn/tools/load_data.py b/example/rcnn/tools/load_data.py deleted file mode 100644 index 2ecdb512cb13..000000000000 --- a/example/rcnn/tools/load_data.py +++ /dev/null @@ -1,21 +0,0 @@ -from helper.dataset.pascal_voc import PascalVOC -from helper.processing.roidb import prepare_roidb, add_bbox_regression_targets - - -def load_train_roidb(image_set, year, root_path, devkit_path, flip=False): - voc = PascalVOC(image_set, year, root_path, devkit_path) - gt_roidb = voc.gt_roidb() - ss_roidb = voc.selective_search_roidb(gt_roidb) - if flip: - ss_roidb = voc.append_flipped_images(ss_roidb) - prepare_roidb(voc, ss_roidb) - means, stds = add_bbox_regression_targets(ss_roidb) - return voc, ss_roidb, means, stds - - -def load_test_roidb(image_set, year, root_path, devkit_path): - voc = PascalVOC(image_set, year, root_path, devkit_path) - gt_roidb = voc.gt_roidb() - ss_roidb = voc.selective_search_roidb(gt_roidb) - prepare_roidb(voc, ss_roidb) - return voc, ss_roidb diff --git a/example/rcnn/tools/test_net.py b/example/rcnn/tools/test_net.py deleted file mode 100644 index 1c0a763d24d3..000000000000 --- a/example/rcnn/tools/test_net.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -from load_data import load_test_roidb -from rcnn.data_iter import ROIIter -from rcnn.symbol import get_symbol_vgg_test -from load_model import load_param -from rcnn.detector import Detector -from rcnn.tester import pred_eval - - -def test_net(imageset, year, root_path, devkit_path, prefix, epoch, ctx): - """ - wrapper for detector - :param imageset: image set to test on - :param year: year of image set - :param root_path: 'data' folder path - :param devkit_path: 'VOCdevkit' folder path - :param prefix: new model prefix - :param epoch: new model epoch - :param ctx: context to evaluate in - :return: None - """ - # set up logger - logger = logging.getLogger() - logger.setLevel(logging.INFO) - - # load testing data - voc, roidb = load_test_roidb(imageset, year, root_path, devkit_path) - test_data = ROIIter(roidb, ctx=ctx, batch_size=1, shuffle=False, mode='test') - - # load model - args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx) - - # load symbol - sym = 
get_symbol_vgg_test() - - # detect - detector = Detector(sym, ctx, args, auxs) - pred_eval(detector, test_data, voc, vis=False) diff --git a/example/rcnn/tools/test_rcnn.py b/example/rcnn/tools/test_rcnn.py new file mode 100644 index 000000000000..fdbc92c97acf --- /dev/null +++ b/example/rcnn/tools/test_rcnn.py @@ -0,0 +1,65 @@ +import argparse +import os + +import mxnet as mx + +from rcnn.config import config +from rcnn.loader import ROIIter +from rcnn.detector import Detector +from rcnn.symbol import get_vgg_test, get_vgg_rcnn_test +from rcnn.tester import pred_eval +from utils.load_data import load_gt_roidb, load_test_ss_roidb, load_test_rpn_roidb +from utils.load_model import load_param + + +def test_rcnn(imageset, year, root_path, devkit_path, prefix, epoch, ctx, vis=False, has_rpn=True, proposal='rpn'): + # load symbol and testing data + if has_rpn: + sym = get_vgg_test() + config.TEST.HAS_RPN = True + config.TEST.RPN_PRE_NMS_TOP_N = 6000 + config.TEST.RPN_POST_NMS_TOP_N = 300 + voc, roidb = load_gt_roidb(imageset, year, root_path, devkit_path) + else: + sym = get_vgg_rcnn_test() + voc, roidb = eval('load_test_' + proposal + '_roidb')(imageset, year, root_path, devkit_path) + + # get test data iter + test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test') + + # load model + args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx) + + # detect + detector = Detector(sym, ctx, args, auxs) + pred_eval(detector, test_data, voc, vis=vis) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + parser.add_argument('--image_set', dest='image_set', help='can be test', + default='test', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str) + parser.add_argument('--epoch', dest='epoch', help='model to test with', + default=8, type=int) + parser.add_argument('--gpu', dest='gpu_id', help='GPU device to test with', + default=0, type=int) + parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true') + parser.add_argument('--has_rpn', dest='has_rpn', help='generate proposals on the fly', + action='store_true') + parser.add_argument('--proposal', dest='proposal', help='can be ss for selective search or rpn', + default='rpn', type=str) + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + ctx = mx.gpu(args.gpu_id) + test_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis, + args.has_rpn, args.proposal) diff --git a/example/rcnn/tools/test_rpn.py b/example/rcnn/tools/test_rpn.py new file mode 100644 index 000000000000..b93c1753a42c --- /dev/null +++ b/example/rcnn/tools/test_rpn.py @@ -0,0 +1,58 @@ +import argparse +import os + +import mxnet as mx + +from rcnn.config import config +from rcnn.loader import ROIIter +from rcnn.rpn.generate import Detector, generate_detections +from rcnn.symbol import get_vgg_rpn_test +from utils.load_data import load_gt_roidb +from utils.load_model import load_param + +# rpn generate proposal config +config.TEST.HAS_RPN = True +config.TEST.RPN_PRE_NMS_TOP_N = 
-1 +config.TEST.RPN_POST_NMS_TOP_N = 2000 + + +def test_rpn(image_set, year, root_path, devkit_path, prefix, epoch, ctx, vis=False): + # load symbol + sym = get_vgg_rpn_test() + + # load testing data + voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path) + test_data = ROIIter(roidb, batch_size=1, shuffle=False, mode='test') + + # load model + args, auxs = load_param(prefix, epoch, convert=True, ctx=ctx) + + # start testing + detector = Detector(sym, ctx, args, auxs) + imdb_boxes = generate_detections(detector, test_data, voc, vis=vis) + voc.evaluate_recall(roidb, candidate_boxes=imdb_boxes) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Region Proposal Network') + parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', + default='trainval', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--prefix', dest='prefix', help='model to test with', type=str) + parser.add_argument('--epoch', dest='epoch', help='model to test with', + default=8, type=int) + parser.add_argument('--gpu', dest='gpu_id', help='GPU device to train with', + default=0, type=int) + parser.add_argument('--vis', dest='vis', help='turn on visualization', action='store_true') + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + ctx = mx.gpu(args.gpu_id) + test_rpn(args.image_set, args.year, args.root_path, args.devkit_path, args.prefix, args.epoch, ctx, args.vis) diff --git a/example/rcnn/tools/train_net.py b/example/rcnn/tools/train_net.py deleted file mode 100644 index ad4552b54af5..000000000000 --- a/example/rcnn/tools/train_net.py +++ /dev/null @@ -1,64 +0,0 @@ -import mxnet as mx -import logging -from rcnn.config import config -from load_data import load_train_roidb -from rcnn.data_iter import ROIIter -from rcnn.symbol import get_symbol_vgg -from load_model import load_checkpoint, load_param -from rcnn.solver import Solver -from save_model import save_checkpoint - - -def train_net(image_set, year, root_path, devkit_path, pretrained, epoch, - prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None): - """ - wrapper for solver - :param image_set: image set to train on - :param year: year of image set - :param root_path: 'data' folder - :param devkit_path: 'VOCdevkit' folder - :param pretrained: prefix of pretrained model - :param epoch: epoch of pretrained model - :param prefix: prefix of new model - :param ctx: context to train in - :param begin_epoch: begin epoch number - :param end_epoch: end epoch number - :param frequent: frequency to print - :return: None - """ - # set up logger - logger = logging.getLogger() - logger.setLevel(logging.INFO) - - # load training data - voc, roidb, means, stds = load_train_roidb(image_set, year, root_path, devkit_path, flip=True) - train_data = ROIIter(roidb, ctx=ctx, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train', work_load_list=work_load_list) - - # load pretrained - args, auxs = load_param(pretrained, epoch, convert=True, ctx=ctx[0]) - del args['fc8_bias'] - del args['fc8_weight'] - - # load symbol - sym = get_symbol_vgg() - - # initialize params - arg_shape, 
_, _ = sym.infer_shape(data=(1, 3, 224, 224), rois=(1, 5)) - arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) - args['cls_score_weight'] = mx.random.normal(mean=0, stdvar=0.01, shape=arg_shape_dict['cls_score_weight'], ctx=ctx[0]) - args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'], ctx=ctx[0]) - args['bbox_pred_weight'] = mx.random.normal(mean=0, stdvar=0.001, shape=arg_shape_dict['bbox_pred_weight'], ctx=ctx[0]) - args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'], ctx=ctx[0]) - - # train - solver = Solver(prefix, sym, ctx, begin_epoch, end_epoch, kv_store, args, auxs, momentum=0.9, wd=0.0005, - learning_rate=0.001, lr_scheduler=mx.lr_scheduler.FactorScheduler(30000, 0.1), max_data_shape=[('data', (1, 3, 1000, 1000))]) - solver.fit(train_data, frequent=frequent) - - # edit params and save - for epoch in range(begin_epoch + 1, end_epoch + 1): - arg_params, aux_params = load_checkpoint(prefix, epoch) - arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds, ctx=ctx[0])).T - arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds, ctx=ctx[0]) + \ - mx.nd.array(means, ctx=ctx[0]) - save_checkpoint(prefix, epoch, arg_params, aux_params) diff --git a/example/rcnn/tools/train_rcnn.py b/example/rcnn/tools/train_rcnn.py new file mode 100644 index 000000000000..432c6a950a79 --- /dev/null +++ b/example/rcnn/tools/train_rcnn.py @@ -0,0 +1,138 @@ +import argparse +import logging +import os + +import mxnet as mx + +from rcnn.callback import Speedometer +from rcnn.config import config +from rcnn.loader import ROIIter +from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric +from rcnn.module import MutableModule +from rcnn.symbol import get_vgg_rcnn +from utils.load_data import load_ss_roidb, load_rpn_roidb +from utils.load_model import load_checkpoint, load_param +from utils.save_model import save_checkpoint + + +def train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch, + prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, + work_load_list=None, resume=False, proposal='rpn'): + # set up logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # load symbol + sym = get_vgg_rcnn() + + # setup multi-gpu + config.TRAIN.BATCH_IMAGES *= len(ctx) + config.TRAIN.BATCH_SIZE *= len(ctx) + + # load training data + voc, roidb, means, stds = eval('load_' + proposal + '_roidb')(image_set, year, root_path, devkit_path, flip=True) + train_data = ROIIter(roidb, batch_size=config.TRAIN.BATCH_IMAGES, shuffle=True, mode='train', + ctx=ctx, work_load_list=work_load_list) + + # infer max shape + max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, 1000, 1000))] + + # load pretrained + args, auxs = load_param(pretrained, epoch, convert=True) + + # initialize params + if not resume: + input_shapes = {k: v for k, v in train_data.provide_data + train_data.provide_label} + arg_shape, _, _ = sym.infer_shape(**input_shapes) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + args['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight']) + args['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias']) + args['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight']) + args['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias']) + + # prepare training + if config.TRAIN.FINETUNE: + fixed_param_prefix = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5'] + else: 
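+ # NOTE: with FINETUNE set (the second alternation round) all five VGG conv
+ # groups above stay fixed; the default below freezes only conv1/conv2.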
+ fixed_param_prefix = ['conv1', 'conv2'] + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent) + epoch_end_callback = mx.callback.do_checkpoint(prefix) + if config.TRAIN.HAS_RPN is True: + eval_metric = AccuracyMetric(use_ignore=True, ignore=-1) + cls_metric = LogLossMetric(use_ignore=True, ignore=-1) + else: + eval_metric = AccuracyMetric() + cls_metric = LogLossMetric() + bbox_metric = SmoothL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + optimizer_params = {'momentum': 0.9, + 'wd': 0.0005, + 'learning_rate': 0.001, + 'lr_scheduler': mx.lr_scheduler.FactorScheduler(30000, 0.1), + 'rescale_grad': (1.0 / config.TRAIN.BATCH_SIZE)} + + # train + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, fixed_param_prefix=fixed_param_prefix) + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kv_store, + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch) + + # edit params and save + for epoch in range(begin_epoch + 1, end_epoch + 1): + arg_params, aux_params = load_checkpoint(prefix, epoch) + arg_params['bbox_pred_weight'] = (arg_params['bbox_pred_weight'].T * mx.nd.array(stds)).T + arg_params['bbox_pred_bias'] = arg_params['bbox_pred_bias'] * mx.nd.array(stds) + \ + mx.nd.array(means) + save_checkpoint(prefix, epoch, arg_params, aux_params) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a Fast R-CNN Network') + parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', + default='trainval', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', + default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str) + parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', + default=1, type=int) + parser.add_argument('--prefix', dest='prefix', help='new model prefix', + default=os.path.join(os.getcwd(), 'model', 'rcnn'), type=str) + parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', + default='0', type=str) + parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training', + default=0, type=int) + parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training', + default=8, type=int) + parser.add_argument('--frequent', dest='frequent', help='frequency of logging', + default=20, type=int) + parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type', + default='device', type=str) + parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices', + default=None, type=list) + parser.add_argument('--finetune', dest='finetune', help='second round finetune', 
action='store_true') + parser.add_argument('--resume', dest='resume', help='continue training', action='store_true') + parser.add_argument('--proposal', dest='proposal', help='can be ss for selective search or rpn', + default='rpn', type=str) + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] + if args.finetune: + config.TRAIN.FINETUNE = True + train_rcnn(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch, + args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent, + args.kv_store, args.work_load_list, args.resume, args.proposal) diff --git a/example/rcnn/tools/train_rpn.py b/example/rcnn/tools/train_rpn.py new file mode 100644 index 000000000000..1b3f489b490b --- /dev/null +++ b/example/rcnn/tools/train_rpn.py @@ -0,0 +1,144 @@ +import argparse +import logging +import os + +import mxnet as mx + +from rcnn.callback import Speedometer +from rcnn.config import config +from rcnn.loader import AnchorLoader +from rcnn.metric import AccuracyMetric, LogLossMetric, SmoothL1LossMetric +from rcnn.module import MutableModule +from rcnn.symbol import get_vgg_rpn +from utils.load_data import load_gt_roidb +from utils.load_model import load_param + +# rpn config +config.TRAIN.HAS_RPN = True +config.TRAIN.BATCH_SIZE = 1 + + +def train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch, + prefix, ctx, begin_epoch, end_epoch, frequent, kv_store, work_load_list=None, resume=False): + # set up logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # load symbol + sym = get_vgg_rpn() + feat_sym = get_vgg_rpn().get_internals()['rpn_cls_score_output'] + + # setup multi-gpu + config.TRAIN.BATCH_IMAGES *= len(ctx) + config.TRAIN.BATCH_SIZE *= len(ctx) + + # load training data + voc, roidb = load_gt_roidb(image_set, year, root_path, devkit_path, flip=True) + train_data = AnchorLoader(feat_sym, roidb, batch_size=config.TRAIN.BATCH_SIZE, shuffle=True, mode='train', + ctx=ctx, work_load_list=work_load_list) + + # infer max shape + max_data_shape = [('data', (config.TRAIN.BATCH_SIZE, 3, 1000, 1000))] + max_data_shape_dict = {k: v for k, v in max_data_shape} + _, feat_shape, _ = feat_sym.infer_shape(**max_data_shape_dict) + from rcnn.minibatch import assign_anchor + import numpy as np + label = assign_anchor(feat_shape[0], np.zeros((0, 5)), [[1000, 1000, 1.0]]) + max_label_shape = [('label', label['label'].shape), + ('bbox_target', label['bbox_target'].shape), + ('bbox_inside_weight', label['bbox_inside_weight'].shape), + ('bbox_outside_weight', label['bbox_outside_weight'].shape)] + print 'providing maximum shape', max_data_shape, max_label_shape + + # load pretrained + args, auxs = load_param(pretrained, epoch, convert=True) + + # initialize params + if not resume: + input_shapes = {k: v for k, v in train_data.provide_data + train_data.provide_label} + arg_shape, _, _ = sym.infer_shape(**input_shapes) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + args['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight']) + args['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias']) + args['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight']) + args['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias']) + args['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_bbox_pred_weight']) + 
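+ # NOTE: every RPN head parameter is freshly initialized from a zero-mean
+ # Gaussian (std 0.01) with zeroed biases; the shared VGG conv weights come
+ # from the pretrained checkpoint loaded above.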
args['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias']) + + # prepare training + if config.TRAIN.FINETUNE: + fixed_param_prefix = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5'] + else: + fixed_param_prefix = ['conv1', 'conv2'] + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + batch_end_callback = Speedometer(train_data.batch_size, frequent=frequent) + epoch_end_callback = mx.callback.do_checkpoint(prefix) + if config.TRAIN.HAS_RPN is True: + eval_metric = AccuracyMetric(use_ignore=True, ignore=-1) + cls_metric = LogLossMetric(use_ignore=True, ignore=-1) + else: + eval_metric = AccuracyMetric() + cls_metric = LogLossMetric() + bbox_metric = SmoothL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + optimizer_params = {'momentum': 0.9, + 'wd': 0.0005, + 'learning_rate': 0.001, + 'lr_scheduler': mx.lr_scheduler.FactorScheduler(60000, 0.1), + 'rescale_grad': (1.0 / config.TRAIN.BATCH_SIZE)} + + # train + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, max_label_shapes=max_label_shape, + fixed_param_prefix=fixed_param_prefix) + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kv_store, + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=args, aux_params=auxs, begin_epoch=begin_epoch, num_epoch=end_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a Region Proposal Network') + parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', + default='trainval', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', + default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str) + parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', + default=1, type=int) + parser.add_argument('--prefix', dest='prefix', help='new model prefix', + default=os.path.join(os.getcwd(), 'model', 'rpn'), type=str) + parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', + default='0', type=str) + parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training', + default=0, type=int) + parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training', + default=8, type=int) + parser.add_argument('--frequent', dest='frequent', help='frequency of logging', + default=20, type=int) + parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type', + default='device', type=str) + parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices', + default=None, type=list) + parser.add_argument('--finetune', dest='finetune', help='second round finetune', action='store_true') + parser.add_argument('--resume', dest='resume', help='continue training', action='store_true') + args = parser.parse_args() 
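+ # Example (a sketch; paths shown relative for brevity, the defaults above
+ # join them to os.getcwd()): train_rpn can also be driven without the CLI:
+ # train_rpn('trainval', '2007', 'data', 'data/VOCdevkit', 'model/vgg16', 1,
+ #           'model/rpn', [mx.gpu(0)], 0, 8, 20, 'device')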
+ return args + +if __name__ == '__main__': + args = parse_args() + ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] + if args.finetune: + config.TRAIN.FINETUNE = True + train_rpn(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch, + args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent, + args.kv_store, args.work_load_list, args.resume) diff --git a/example/rcnn/train.py b/example/rcnn/train.py deleted file mode 100644 index ad61855ae50f..000000000000 --- a/example/rcnn/train.py +++ /dev/null @@ -1,42 +0,0 @@ -import argparse -import mxnet as mx -import os -from tools.train_net import train_net - - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') - parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', - default='trainval', type=str) - parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', - default='2007', type=str) - parser.add_argument('--root_path', dest='root_path', help='output data folder', - default=os.path.join(os.getcwd(), 'data'), type=str) - parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', - default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) - parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', - default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str) - parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', - default=1, type=int) - parser.add_argument('--prefix', dest='prefix', help='new model prefix', - default=os.path.join(os.getcwd(), 'model', 'frcnn'), type=str) - parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', - default='0', type=str) - parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training', - default=0, type=int) - parser.add_argument('--end_epoch', dest='end_epoch', help='end epoch of training', - default=8, type=int) - parser.add_argument('--frequent', dest='frequent', help='frequency of logging', - default=20, type=int) - parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type', - default='local', type=str) - parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices', - default=None, type=list) - args = parser.parse_args() - return args - -if __name__ == '__main__': - args = parse_args() - ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] - train_net(args.image_set, args.year, args.root_path, args.devkit_path, args.pretrained, args.epoch, - args.prefix, ctx, args.begin_epoch, args.end_epoch, args.frequent, args.kv_store, args.work_load_list) diff --git a/example/rcnn/train_alternate.py b/example/rcnn/train_alternate.py new file mode 100644 index 000000000000..5e3ba7f07780 --- /dev/null +++ b/example/rcnn/train_alternate.py @@ -0,0 +1,104 @@ +import argparse +import logging +import os + +import mxnet as mx + +from rcnn.config import config +from rcnn.loader import AnchorLoader, ROIIter +from tools.train_rpn import train_rpn +from tools.train_rcnn import train_rcnn +from tools.test_rpn import test_rpn +from utils.combine_model import combine_model + + +def alternate_train(image_set, test_image_set, year, root_path, devkit_path, pretrained, epoch, + ctx, begin_epoch, rpn_epoch, rcnn_epoch, frequent, kv_store, work_load_list=None): + # set up logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + config.TRAIN.BG_THRESH_LO = 0.0 + + logging.info('########## TRAIN 
RPN WITH IMAGENET INIT') + config.TRAIN.HAS_RPN = True + config.TRAIN.BATCH_SIZE = 1 + train_rpn(image_set, year, root_path, devkit_path, pretrained, epoch, + 'model/rpn1', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list) + + logging.info('########## GENERATE RPN DETECTION') + config.TEST.HAS_RPN = True + config.TEST.RPN_PRE_NMS_TOP_N = -1 + config.TEST.RPN_POST_NMS_TOP_N = 2000 + test_rpn(image_set, year, root_path, devkit_path, 'model/rpn1', rpn_epoch, ctx[0]) + + logging.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION') + config.TRAIN.HAS_RPN = False + config.TRAIN.BATCH_SIZE = 128 + train_rcnn(image_set, year, root_path, devkit_path, pretrained, epoch, + 'model/rcnn1', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list) + + logging.info('########## TRAIN RPN WITH RCNN INIT') + config.TRAIN.HAS_RPN = True + config.TRAIN.BATCH_SIZE = 1 + config.TRAIN.FINETUNE = True + train_rpn(image_set, year, root_path, devkit_path, 'model/rcnn1', rcnn_epoch, + 'model/rpn2', ctx, begin_epoch, rpn_epoch, frequent, kv_store, work_load_list) + + logging.info('########## GENERATE RPN DETECTION') + config.TEST.HAS_RPN = True + config.TEST.RPN_PRE_NMS_TOP_N = -1 + config.TEST.RPN_POST_NMS_TOP_N = 2000 + test_rpn(image_set, year, root_path, devkit_path, 'model/rpn2', rpn_epoch, ctx[0]) + + logger.info('########## COMBINE RPN2 WITH RCNN1') + combine_model('model/rpn2', rpn_epoch, 'model/rcnn1', rcnn_epoch, 'model/rcnn2', 0) + + logger.info('########## TRAIN RCNN WITH RPN INIT AND DETECTION') + config.TRAIN.HAS_RPN = False + config.TRAIN.BATCH_SIZE = 128 + train_rcnn(image_set, year, root_path, devkit_path, 'model/rcnn2', 0, + 'model/rcnn2', ctx, begin_epoch, rcnn_epoch, frequent, kv_store, work_load_list) + + logger.info('########## COMBINE RPN2 WITH RCNN2') + combine_model('model/rpn2', rpn_epoch, 'model/rcnn2', rcnn_epoch, 'model/final', 0) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train Faster R-CNN Network') + parser.add_argument('--image_set', dest='image_set', help='can be trainval or train', + default='trainval', type=str) + parser.add_argument('--test_image_set', dest='test_image_set', help='can be test or val', + default='test', type=str) + parser.add_argument('--year', dest='year', help='can be 2007, 2010, 2012', + default='2007', type=str) + parser.add_argument('--root_path', dest='root_path', help='output data folder', + default=os.path.join(os.getcwd(), 'data'), type=str) + parser.add_argument('--devkit_path', dest='devkit_path', help='VOCdevkit path', + default=os.path.join(os.getcwd(), 'data', 'VOCdevkit'), type=str) + parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', + default=os.path.join(os.getcwd(), 'model', 'vgg16'), type=str) + parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', + default=1, type=int) + parser.add_argument('--gpus', dest='gpu_ids', help='GPU device to train with', + default='0', type=str) + parser.add_argument('--begin_epoch', dest='begin_epoch', help='begin epoch of training', + default=0, type=int) + parser.add_argument('--rpn_epoch', dest='rpn_epoch', help='end epoch of rpn training', + default=8, type=int) + parser.add_argument('--rcnn_epoch', dest='rcnn_epoch', help='end epoch of rcnn training', + default=8, type=int) + parser.add_argument('--frequent', dest='frequent', help='frequency of logging', + default=20, type=int) + parser.add_argument('--kv_store', dest='kv_store', help='the kv-store type', + default='device', type=str) + 
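+ # NOTE: rpn_epoch and rcnn_epoch above bound the four stages of the
+ # alternating schedule implemented in alternate_train.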
parser.add_argument('--work_load_list', dest='work_load_list', help='work load for different devices', + default=None, type=list) + args = parser.parse_args() + return args + +if __name__ == '__main__': + args = parse_args() + ctx = [mx.gpu(int(i)) for i in args.gpu_ids.split(',')] + alternate_train(args.image_set, args.test_image_set, args.year, args.root_path, args.devkit_path, + args.pretrained, args.epoch, ctx, args.begin_epoch, args.rpn_epoch, args.rcnn_epoch, + args.frequent, args.kv_store, args.work_load_list) diff --git a/example/rcnn/utils/__init__.py b/example/rcnn/utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/example/rcnn/utils/caffe_convert.py b/example/rcnn/utils/caffe_convert.py new file mode 100644 index 000000000000..4dfbfb4e186f --- /dev/null +++ b/example/rcnn/utils/caffe_convert.py @@ -0,0 +1,74 @@ +# This script will not work unless all paths are set right + +import os +import sys +import mxnet as mx +import numpy as np +fast_rcnn_path = None +sys.path.insert(0, os.path.join(fast_rcnn_path, 'caffe-fast-rcnn', 'python')) +sys.path.insert(0, os.path.join(fast_rcnn_path, 'lib')) +import caffe +from rcnn.symbol import get_symbol_vgg_test + +def load_model(caffeproto, caffemodel, arg_shape_dic): + def get_caffe_iter(layer_names, layers): + for layer_idx, layer in enumerate(layers): + layer_name = layer_names[layer_idx].replace('/', '_') + layer_type = layer.type + layer_blobs = layer.blobs + yield (layer_name, layer_type, layer_blobs) + + net_caffe = caffe.Net(caffeproto, caffemodel, caffe.TEST) + layer_names = net_caffe._layer_names + layers = net_caffe.layers + iter = '' + iter = get_caffe_iter(layer_names, layers) + first_conv = True + + arg_params = {} + for layer_name, layer_type, layer_blobs in iter: + if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14: + assert(len(layer_blobs) == 2) + wmat = np.array(layer_blobs[0].data).reshape(layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width) + bias = np.array(layer_blobs[1].data) + if first_conv: + print 'Swapping BGR of caffe into RGB in mxnet' + wmat[:, [0, 2], :, :] = wmat[:, [2, 0], :, :] + + assert(wmat.flags['C_CONTIGUOUS'] is True) + assert(bias.flags['C_CONTIGUOUS'] is True) + print 'converting layer {0}, wmat shape = {1}, bias shape = {2}'.format(layer_name, wmat.shape, bias.shape) + wmat = wmat.reshape((wmat.shape[0], -1)) + bias = bias.reshape((bias.shape[0], 1)) + weight_name = layer_name + "_weight" + bias_name = layer_name + "_bias" + + if weight_name not in arg_shape_dic: + print weight_name + ' not found in arg_shape_dic.' 
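+ # NOTE: parameters present in the caffe net but absent from the mxnet
+ # symbol are reported and skipped rather than aborting the conversion.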
+ continue + wmat = wmat.reshape(arg_shape_dic[weight_name]) + arg_params[weight_name] = mx.nd.zeros(wmat.shape) + arg_params[weight_name][:] = wmat + + bias = bias.reshape(arg_shape_dic[bias_name]) + arg_params[bias_name] = mx.nd.zeros(bias.shape) + arg_params[bias_name][:] = bias + + if first_conv and (layer_type == 'Convolution' or layer_type == 4): + first_conv = False + + return arg_params + +proto_path = os.path.join(fast_rcnn_path, 'models', 'VGG16', 'test.prototxt') +model_path = os.path.join(fast_rcnn_path, 'data', 'fast_rcnn_models', 'vgg16_fast_rcnn_iter_40000.caffemodel') + +symbol = get_symbol_vgg_test() +arg_shapes, out_shapes, aux_shapes = symbol.infer_shape(**{'data': (1, 3, 224, 224), 'rois': (1, 5)}) +arg_shape_dic = { name: shape for name, shape in zip(symbol.list_arguments(), arg_shapes) } + +arg_params = load_model(proto_path, model_path, arg_shape_dic) + +model = mx.model.FeedForward(ctx=mx.cpu(), symbol=symbol, arg_params=arg_params, + aux_params={}, num_epoch=1, + learning_rate=0.01, momentum=0.9, wd=0.0001) +model.save('model/ref') diff --git a/example/rcnn/utils/combine_model.py b/example/rcnn/utils/combine_model.py new file mode 100644 index 000000000000..5518dda4a989 --- /dev/null +++ b/example/rcnn/utils/combine_model.py @@ -0,0 +1,22 @@ +from load_model import load_checkpoint +from save_model import save_checkpoint + + +def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out): + args1, auxs1 = load_checkpoint(prefix1, epoch1) + args2, auxs2 = load_checkpoint(prefix2, epoch2) + arg_names = args1.keys() + args2.keys() + aux_names = auxs1.keys() + auxs2.keys() + args = dict() + for arg in arg_names: + if arg in args1: + args[arg] = args1[arg] + else: + args[arg] = args2[arg] + auxs = dict() + for aux in aux_names: + if aux in auxs1: + auxs[aux] = auxs1[aux] + else: + auxs[aux] = auxs2[aux] + save_checkpoint(prefix_out, epoch_out, args, auxs) diff --git a/example/rcnn/utils/load_data.py b/example/rcnn/utils/load_data.py new file mode 100644 index 000000000000..cc6317e0e74a --- /dev/null +++ b/example/rcnn/utils/load_data.py @@ -0,0 +1,49 @@ +from helper.dataset.pascal_voc import PascalVOC +from helper.processing.roidb import prepare_roidb, add_bbox_regression_targets + + +def load_ss_roidb(image_set, year, root_path, devkit_path, flip=False): + voc = PascalVOC(image_set, year, root_path, devkit_path) + gt_roidb = voc.gt_roidb() + ss_roidb = voc.selective_search_roidb(gt_roidb) + if flip: + ss_roidb = voc.append_flipped_images(ss_roidb) + prepare_roidb(voc, ss_roidb) + means, stds = add_bbox_regression_targets(ss_roidb) + return voc, ss_roidb, means, stds + + +def load_gt_roidb(image_set, year, root_path, devkit_path, flip=False): + voc = PascalVOC(image_set, year, root_path, devkit_path) + gt_roidb = voc.gt_roidb() + if flip: + gt_roidb = voc.append_flipped_images(gt_roidb) + prepare_roidb(voc, gt_roidb) + return voc, gt_roidb + + +def load_rpn_roidb(image_set, year, root_path, devkit_path, flip=False): + voc = PascalVOC(image_set, year, root_path, devkit_path) + gt_roidb = voc.gt_roidb() + rpn_roidb = voc.rpn_roidb(gt_roidb) + if flip: + rpn_roidb = voc.append_flipped_images(rpn_roidb) + prepare_roidb(voc, rpn_roidb) + means, stds = add_bbox_regression_targets(rpn_roidb) + return voc, rpn_roidb, means, stds + + +def load_test_ss_roidb(image_set, year, root_path, devkit_path): + voc = PascalVOC(image_set, year, root_path, devkit_path) + gt_roidb = voc.gt_roidb() + ss_roidb = voc.selective_search_roidb(gt_roidb) + prepare_roidb(voc, ss_roidb) + 
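+ # NOTE: these loaders differ only in proposal source (gt, selective search,
+ # or rpn); the training variants optionally flip images, and the ss/rpn
+ # training variants also return bbox regression means/stds, which the
+ # test-time loaders omit.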
    return voc, ss_roidb
+
+
+def load_test_rpn_roidb(image_set, year, root_path, devkit_path):
+    voc = PascalVOC(image_set, year, root_path, devkit_path)
+    gt_roidb = voc.gt_roidb()
+    rpn_roidb = voc.rpn_roidb(gt_roidb)
+    prepare_roidb(voc, rpn_roidb)
+    return voc, rpn_roidb
diff --git a/example/rcnn/tools/load_model.py b/example/rcnn/utils/load_model.py
similarity index 97%
rename from example/rcnn/tools/load_model.py
rename to example/rcnn/utils/load_model.py
index bd5a28ea23ef..c767661232e7 100644
--- a/example/rcnn/tools/load_model.py
+++ b/example/rcnn/utils/load_model.py
@@ -47,7 +47,8 @@ def load_param(prefix, epoch, convert=False, ctx=None):
     """
     arg_params, aux_params = load_checkpoint(prefix, epoch)
     if convert:
-        assert ctx is not None
+        if ctx is None:
+            ctx = mx.cpu()
         arg_params = convert_context(arg_params, ctx)
         aux_params = convert_context(aux_params, ctx)
     return arg_params, aux_params
diff --git a/example/rcnn/tools/save_model.py b/example/rcnn/utils/save_model.py
similarity index 100%
rename from example/rcnn/tools/save_model.py
rename to example/rcnn/utils/save_model.py
diff --git a/example/rnn/README.md b/example/rnn/README.md
index c3b6e225add8..294e7726268e 100644
--- a/example/rnn/README.md
+++ b/example/rnn/README.md
@@ -10,11 +10,6 @@ This folder contains RNN examples using low level symbol interface.
 - [gru_bucketing.py](gru_bucketing.py) PennTreeBank language model by using GRU
 - [char-rnn.ipynb](char-rnn.ipynb) Notebook to demo how to train a character LSTM by using ```lstm.py```
 
-## R
-
-- [lstm.R](lstm.R) Functions for building a LSTM Network
-- [char_lstm.R](char_lstm.R) demo how to train a character LSTM by using ```lstm.R```
-
 Performance Note:
 More ```MXNET_GPU_WORKER_NTHREADS``` may lead to better performance. For setting ```MXNET_GPU_WORKER_NTHREADS```, please refer to [Environment Variables](https://mxnet.readthedocs.org/en/latest/how_to/env_var.html).
diff --git a/example/rnn/char-rnn.ipynb b/example/rnn/char-rnn.ipynb
index cefe60b96995..4ad18815be02 100644
--- a/example/rnn/char-rnn.ipynb
+++ b/example/rnn/char-rnn.ipynb
@@ -43,14 +43,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "\n"
+    "\n"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    ""
+    ""
    ]
   },
   {
@@ -145,7 +145,7 @@
    ],
    "source": [
     "import os\n",
-    "data_url = \"http://webdocs.cs.ualberta.ca/~bx3/lab_data.zip\"\n",
+    "data_url = \"http://data.dmlc.ml/mxnet/data/lab_data.zip\"\n",
     "os.system(\"wget %s\" % data_url)\n",
     "os.system(\"unzip -o lab_data.zip\")"
    ]
diff --git a/example/svm_mnist/README.md b/example/svm_mnist/README.md
new file mode 100644
index 000000000000..082c2053f27e
--- /dev/null
+++ b/example/svm_mnist/README.md
@@ -0,0 +1,11 @@
+# Use case with Support Vector Machine
+
+To check that the implementation not only learns, but can also outperform the softmax, as [this article](https://arxiv.org/pdf/1306.0239.pdf) suggests, I ran the svm_mnist.py script. It is based on the MNIST experiment described in the article and on [this tutorial](https://github.com/dmlc/mxnet-gtc-tutorial/blob/master/tutorial.ipynb).
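+
+For reference, the L2-SVM objective the article optimizes (stated here from the article, not from this example's code) is the squared hinge loss:
+
+```
+\min_{w}\; \tfrac{1}{2}\lVert w\rVert^{2} + C \sum_{n} \max\left(0,\; 1 - y_{n}\, w^{\top} x_{n}\right)^{2}
+```
+
+The L1-SVM variant (enabled with `use_linear=True` below) drops the square on the hinge term.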
+
+
+## To run this you will need
+
+* [Numpy](http://www.scipy.org/scipylib/download.html)
+* [Sklearn](http://scikit-learn.org/stable/install.html)
+
+I recommend installing [matplotlib](http://matplotlib.org/users/installing.html) to visualize examples
\ No newline at end of file
diff --git a/example/svm_mnist/svm_mnist.py b/example/svm_mnist/svm_mnist.py
new file mode 100644
index 000000000000..f36a0457616f
--- /dev/null
+++ b/example/svm_mnist/svm_mnist.py
@@ -0,0 +1,84 @@
+
+#############################################################
+## Please read the README.md document for better reference ##
+#############################################################
+
+import mxnet as mx
+import numpy as np
+from sklearn.datasets import fetch_mldata
+from sklearn.decomposition import PCA
+# import matplotlib.pyplot as plt
+import logging
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+# Network declaration as symbols. The following pattern was based
+# on the article, but feel free to play with the number of nodes
+# and with the activation function
+data = mx.symbol.Variable('data')
+fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=512)
+act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 512)
+act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
+
+# Here we add the ultimate layer based on L2-SVM objective
+mlp = mx.symbol.SVMOutput(data=fc3, name='svm')
+
+# To use L1-SVM objective, comment the line above and uncomment the line below
+# mlp = mx.symbol.SVMOutput(data=fc3, name='svm', use_linear=True)
+
+# Now we fetch the MNIST dataset, add some noise as the article suggests,
+# then permute and assign the examples to be used by our network
+mnist = fetch_mldata('MNIST original')
+mnist_pca = PCA(n_components=70).fit_transform(mnist.data)
+noise = np.random.normal(size=mnist_pca.shape)
+mnist_pca += noise
+np.random.seed(1234) # set seed for deterministic ordering
+p = np.random.permutation(mnist_pca.shape[0])
+X = mnist_pca[p]
+Y = mnist.target[p]
+X_show = mnist.data[p]
+
+# This is just to normalize the input to a value inside [0,1],
+# and separate train set and test set
+X = X.astype(np.float32)/255
+X_train = X[:60000]
+X_test = X[60000:]
+X_show = X_show[60000:]
+Y_train = Y[:60000]
+Y_test = Y[60000:]
+
+# Article's suggestion on batch size
+batch_size = 200
+train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=batch_size)
+test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=batch_size)
+
+# A quick workaround to prevent mxnet from complaining about the lack of a softmax_label
+train_iter.label = mx.io._init_data(Y_train, allow_empty=True, default_name='svm_label')
+test_iter.label = mx.io._init_data(Y_test, allow_empty=True, default_name='svm_label')
+
+# Here we instantiate and fit the model for our data
+# The article actually suggests using 400 epochs,
+# but I reduced it to 10 for convenience
+model = mx.model.FeedForward(
+    ctx = mx.cpu(0),      # Run on CPU 0
+    symbol = mlp,         # Use the network we just defined
+    num_epoch = 10,       # Train for 10 epochs
+    learning_rate = 0.1,  # Learning rate
+    momentum = 0.9,       # Momentum for SGD with momentum
+    wd = 0.00001,         # Weight decay for regularization
+    )
+model.fit(
+    X=train_iter,  # Training data set
+    eval_data=test_iter,  # Testing data set. MXNet computes scores on the test set every epoch
+    batch_end_callback = mx.callback.Speedometer(batch_size, 200))  # Logging module to print out progress
+
+# Uncomment to view an example
+# plt.imshow((X_show[0].reshape((28,28))*255).astype(np.uint8), cmap='Greys_r')
+# plt.show()
+# print 'Result:', model.predict(X_test[0:1])[0].argmax()
+
+# Now print how well the network did with this configuration
+print 'Accuracy:', model.score(test_iter)*100, '%'
\ No newline at end of file
diff --git a/example/warpctc/README.md b/example/warpctc/README.md
new file mode 100644
index 000000000000..32306aa157bd
--- /dev/null
+++ b/example/warpctc/README.md
@@ -0,0 +1,91 @@
+# Baidu Warp CTC with MXNet
+
+Baidu warp-ctc is a CTC implementation from Baidu with GPU support. CTC can be used together with LSTM to solve label alignment problems in many areas, such as OCR and speech recognition.
+
+## Install Baidu warp-ctc
+
+```
+  cd ~/
+  git clone https://github.com/baidu-research/warp-ctc
+  cd warp-ctc
+  mkdir build
+  cd build
+  cmake ..
+  make
+  sudo make install
+```
+
+## Enable warpctc in mxnet
+
+```
+  uncomment the following lines in make/config.mk:
+  WARPCTC_PATH = $(HOME)/warp-ctc
+  MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
+  then rebuild mxnet:
+  make clean && make -j4
+```
+
+## Run examples
+
+I implemented two examples. The first is a toy example that can be used to verify that the CTC integration works. The second is an OCR example with LSTM+CTC. You can run the OCR example by:
+
+```
+  cd example/warpctc
+  python lstm_ocr.py
+```
+
+The OCR example is constructed as follows:
+
+1. I generate an 80x30 image for a 4-digit captcha with a Python captcha library
+2. The 80x30 image is fed to the LSTM as 80 inputs, where each input is one column of the image (a 30-dim vector)
+3. The output layer uses CTC loss
+
+The following code shows the construction of the net in detail:
+
+```
+def lstm_unroll(num_lstm_layer, seq_len,
+                num_hidden, num_label):
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
+                                     i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
+                                     h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
+                                     h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
+        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
+                          h=mx.sym.Variable("l%d_init_h" % i))
+        last_states.append(state)
+    assert(len(last_states) == num_lstm_layer)
+    data = mx.sym.Variable('data')
+    label = mx.sym.Variable('label')
+
+    # every column of the image is an input; there are seq_len inputs
+    wordvec = mx.sym.SliceChannel(data=data, num_outputs=seq_len, squeeze_axis=1)
+    hidden_all = []
+    for seqidx in range(seq_len):
+        hidden = wordvec[seqidx]
+        for i in range(num_lstm_layer):
+            next_state = lstm(num_hidden, indata=hidden,
+                              prev_state=last_states[i],
+                              param=param_cells[i],
+                              seqidx=seqidx, layeridx=i)
+            hidden = next_state.h
+            last_states[i] = next_state
+        hidden_all.append(hidden)
+    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
+    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=11)
+
+    # here we do NOT need to transpose label as other lstm examples do
+    label = mx.sym.Reshape(data=label, target_shape=(0,))
+    # label should be int type, so use cast
+    label = mx.sym.Cast(data = label, dtype = 'int32')
+    sm = mx.sym.WarpCTC(data=pred, label=label, label_length = num_label, input_length = seq_len)
+    return sm
+```
+
+## Support multiple label lengths
+
+If your label length is smaller than or equal to b, you should provide labels of length b; for those samples whose label length is smaller than b, append 0 to the label data to make it length b. Here, 0 is reserved for the blank label.
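+
+For example, with b = 4, a minimal sketch of this padding rule (it mirrors `get_label` in `lstm_ocr.py`; not part of the example code itself) is:
+
+```
+def pad_label(digits, b=4):
+    # digits are shifted by +1 so that 0 stays reserved for the blank label
+    label = [1 + int(d) for d in digits]
+    return label + [0] * (b - len(label))
+
+print(pad_label("123"))   # [2, 3, 4, 0]
+print(pad_label("1234"))  # [2, 3, 4, 5]
+```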
diff --git a/example/warpctc/lstm.py b/example/warpctc/lstm.py
new file mode 100644
index 000000000000..32ba2455e11d
--- /dev/null
+++ b/example/warpctc/lstm.py
@@ -0,0 +1,79 @@
+# pylint:skip-file
+import sys
+sys.path.insert(0, "../../python")
+import mxnet as mx
+import numpy as np
+from collections import namedtuple
+import time
+import math
+LSTMState = namedtuple("LSTMState", ["c", "h"])
+LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias",
+                                     "h2h_weight", "h2h_bias"])
+LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol",
+                                     "init_states", "last_states",
+                                     "seq_data", "seq_labels", "seq_outputs",
+                                     "param_blocks"])
+
+def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx):
+    """LSTM Cell symbol"""
+    i2h = mx.sym.FullyConnected(data=indata,
+                                weight=param.i2h_weight,
+                                bias=param.i2h_bias,
+                                num_hidden=num_hidden * 4,
+                                name="t%d_l%d_i2h" % (seqidx, layeridx))
+    h2h = mx.sym.FullyConnected(data=prev_state.h,
+                                weight=param.h2h_weight,
+                                bias=param.h2h_bias,
+                                num_hidden=num_hidden * 4,
+                                name="t%d_l%d_h2h" % (seqidx, layeridx))
+    gates = i2h + h2h
+    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4,
+                                      name="t%d_l%d_slice" % (seqidx, layeridx))
+    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
+    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
+    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
+    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")
+    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
+    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")
+    return LSTMState(c=next_c, h=next_h)
+
+
+def lstm_unroll(num_lstm_layer, seq_len,
+                num_hidden, num_label):
+    param_cells = []
+    last_states = []
+    for i in range(num_lstm_layer):
+        param_cells.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
+                                     i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
+                                     h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
+                                     h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
+        state = LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
+                          h=mx.sym.Variable("l%d_init_h" % i))
+        last_states.append(state)
+    assert(len(last_states) == num_lstm_layer)
+
+    # input data and label
+    data = mx.sym.Variable('data')
+    label = mx.sym.Variable('label')
+    wordvec = mx.sym.SliceChannel(data=data, num_outputs=seq_len, squeeze_axis=1)
+
+    hidden_all = []
+    for seqidx in range(seq_len):
+        hidden = wordvec[seqidx]
+        for i in range(num_lstm_layer):
+            next_state = lstm(num_hidden, indata=hidden,
+                              prev_state=last_states[i],
+                              param=param_cells[i],
+                              seqidx=seqidx, layeridx=i)
+            hidden = next_state.h
+            last_states[i] = next_state
+        hidden_all.append(hidden)
+
+    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
+    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=11)
+
+    label = mx.sym.Reshape(data=label, shape=(-1,))
+    label = mx.sym.Cast(data = label, dtype = 'int32')
+    sm = mx.sym.WarpCTC(data=pred, label=label, label_length = num_label, input_length = seq_len)
+    return sm
+
diff --git a/example/warpctc/lstm_ocr.py b/example/warpctc/lstm_ocr.py
new file mode 100644
index 000000000000..048572500b85
--- /dev/null
+++ b/example/warpctc/lstm_ocr.py
@@ -0,0 +1,176 @@
+# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
+# pylint: disable=superfluous-parens, no-member, invalid-name
+import sys, random
+sys.path.insert(0, "../../python")
+import numpy as np
+import mxnet as mx
+
+from lstm import lstm_unroll
+
+from captcha.image import ImageCaptcha
+import cv2
+
+class SimpleBatch(object):
+    def __init__(self, data_names, data, label_names, label):
+        self.data = data
+        self.label = label
+        self.data_names = data_names
+        self.label_names = label_names
+
+        self.pad = 0
+        self.index = None  # TODO: what is index?
+
+    @property
+    def provide_data(self):
+        return [(n, x.shape) for n, x in zip(self.data_names, self.data)]
+
+    @property
+    def provide_label(self):
+        return [(n, x.shape) for n, x in zip(self.label_names, self.label)]
+
+def gen_rand():
+    buf = ""
+    max_len = random.randint(3,4)
+    for i in range(max_len):
+        buf += str(random.randint(0,9))
+    return buf
+
+def get_label(buf):
+    ret = np.zeros(4)
+    for i in range(len(buf)):
+        ret[i] = 1 + int(buf[i])
+    if len(buf) == 3:
+        ret[3] = 0
+    return ret
+
+class OCRIter(mx.io.DataIter):
+    def __init__(self, count, batch_size, num_label, init_states):
+        super(OCRIter, self).__init__()
+        self.captcha = ImageCaptcha(fonts=['./data/Xerox.ttf'])
+        self.batch_size = batch_size
+        self.count = count
+        self.num_label = num_label
+        self.init_states = init_states
+        self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states]
+        self.provide_data = [('data', (batch_size, 2400))] + init_states
+        self.provide_label = [('label', (self.batch_size, 4))]
+
+    def __iter__(self):
+        init_state_names = [x[0] for x in self.init_states]
+        for k in range(self.count):
+            data = []
+            label = []
+            for i in range(self.batch_size):
+                num = gen_rand()
+                img = self.captcha.generate(num)
+                img = np.fromstring(img.getvalue(), dtype='uint8')
+                img = cv2.imdecode(img, cv2.IMREAD_GRAYSCALE)
+                img = cv2.resize(img, (80, 30))
+                img = img.transpose(1, 0)
+                img = img.reshape((80 * 30))
+                img = np.multiply(img, 1/255.0)
+                data.append(img)
+                label.append(get_label(num))
+
+            data_all = [mx.nd.array(data)] + self.init_state_arrays
+            label_all = [mx.nd.array(label)]
+            data_names = ['data'] + init_state_names
+            label_names = ['label']
+
+            data_batch = SimpleBatch(data_names, data_all, label_names, label_all)
+            yield data_batch
+
+    def reset(self):
+        pass
+
+BATCH_SIZE = 32
+SEQ_LENGTH = 80
+
+def ctc_label(p):
+    # greedy CTC decoding: collapse repeated symbols, then drop blanks (0)
+    ret = []
+    p1 = [0] + p
+    for i in range(len(p)):
+        c1 = p1[i]
+        c2 = p1[i+1]
+        if c2 == 0 or c2 == c1:
+            continue
+        ret.append(c2)
+    return ret
+
+def remove_blank(l):
+    # strip trailing blank padding (0) from a label vector
+    ret = []
+    for i in range(len(l)):
+        if l[i] == 0:
+            break
+        ret.append(l[i])
+    return ret
+
+def Accuracy(label, pred):
+    global BATCH_SIZE
+    global SEQ_LENGTH
+    hit = 0.
+    total = 0.
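+    # Greedy decoding accuracy: for each sample, take the per-timestep
+    # argmax, collapse it with ctc_label, and count exact label matches.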
+ for i in range(BATCH_SIZE): + l = remove_blank(label[i]) + p = [] + for k in range(SEQ_LENGTH): + p.append(np.argmax(pred[k * BATCH_SIZE + i])) + p = ctc_label(p) + if len(p) == len(l): + match = True + for k in range(len(p)): + if p[k] != int(l[k]): + match = False + break + if match: + hit += 1.0 + total += 1.0 + return hit / total + +if __name__ == '__main__': + num_hidden = 100 + num_lstm_layer = 2 + + num_epoch = 10 + learning_rate = 0.001 + momentum = 0.9 + num_label = 4 + + contexts = [mx.context.gpu(1)] + + def sym_gen(seq_len): + return lstm_unroll(num_lstm_layer, seq_len, + num_hidden=num_hidden, + num_label = num_label) + + init_c = [('l%d_init_c'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)] + init_h = [('l%d_init_h'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)] + init_states = init_c + init_h + + data_train = OCRIter(10000, BATCH_SIZE, num_label, init_states) + data_val = OCRIter(1000, BATCH_SIZE, num_label, init_states) + + symbol = sym_gen(SEQ_LENGTH) + + model = mx.model.FeedForward(ctx=contexts, + symbol=symbol, + num_epoch=num_epoch, + learning_rate=learning_rate, + momentum=momentum, + wd=0.00001, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34)) + + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + print 'begin fit' + + model.fit(X=data_train, eval_data=data_val, + eval_metric = mx.metric.np(Accuracy), + batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 50),) + + model.save("ocr") diff --git a/example/warpctc/toy_ctc.py b/example/warpctc/toy_ctc.py new file mode 100644 index 000000000000..2caa11e68399 --- /dev/null +++ b/example/warpctc/toy_ctc.py @@ -0,0 +1,163 @@ +# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme +# pylint: disable=superfluous-parens, no-member, invalid-name +import sys +sys.path.insert(0, "../../python") +import numpy as np +import mxnet as mx +import random +from lstm import lstm_unroll + +class SimpleBatch(object): + def __init__(self, data_names, data, label_names, label): + self.data = data + self.label = label + self.data_names = data_names + self.label_names = label_names + + self.pad = 0 + self.index = None # TODO: what is index? 
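+
+    # provide_data/provide_label below expose (name, shape) pairs so the
+    # model can bind executors for this hand-rolled batch type.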
+ + @property + def provide_data(self): + return [(n, x.shape) for n, x in zip(self.data_names, self.data)] + + @property + def provide_label(self): + return [(n, x.shape) for n, x in zip(self.label_names, self.label)] + +def gen_feature(n): + ret = np.zeros(10) + ret[n] = 1 + return ret + +def gen_rand(): + num = random.randint(0, 9999) + buf = str(num) + while len(buf) < 4: + buf = "0" + buf + ret = np.array([]) + for i in range(80): + c = int(buf[i / 20]) + ret = np.concatenate([ret, gen_feature(c)]) + return buf, ret + +def get_label(buf): + ret = np.zeros(4) + for i in range(4): + ret[i] = 1 + int(buf[i]) + return ret + +class DataIter(mx.io.DataIter): + def __init__(self, count, batch_size, num_label, init_states): + super(DataIter, self).__init__() + self.batch_size = batch_size + self.count = count + self.num_label = num_label + self.init_states = init_states + self.init_state_arrays = [mx.nd.zeros(x[1]) for x in init_states] + self.provide_data = [('data', (batch_size, 10 * 80))] + init_states + self.provide_label = [('label', (self.batch_size, 4))] + + def __iter__(self): + init_state_names = [x[0] for x in self.init_states] + for k in range(self.count): + data = [] + label = [] + for i in range(self.batch_size): + num, img = gen_rand() + data.append(img) + label.append(get_label(num)) + + data_all = [mx.nd.array(data)] + self.init_state_arrays + label_all = [mx.nd.array(label)] + data_names = ['data'] + init_state_names + label_names = ['label'] + + + data_batch = SimpleBatch(data_names, data_all, label_names, label_all) + yield data_batch + + def reset(self): + pass + +BATCH_SIZE = 32 +SEQ_LENGTH = 80 + +def ctc_label(p): + ret = [] + p1 = [0] + p + for i in range(len(p)): + c1 = p1[i] + c2 = p1[i+1] + if c2 == 0 or c2 == c1: + continue + ret.append(c2) + return ret + + +def Accuracy(label, pred): + global BATCH_SIZE + global SEQ_LENGTH + hit = 0. + total = 0. 
+    for i in range(BATCH_SIZE):
+        l = label[i]
+        p = []
+        for k in range(SEQ_LENGTH):
+            p.append(np.argmax(pred[k * BATCH_SIZE + i]))
+        p = ctc_label(p)
+        if len(p) == len(l):
+            match = True
+            for k in range(len(p)):
+                if p[k] != int(l[k]):
+                    match = False
+                    break
+            if match:
+                hit += 1.0
+        total += 1.0
+    return hit / total
+
+if __name__ == '__main__':
+    num_hidden = 100
+    num_lstm_layer = 1
+
+    num_epoch = 10
+    learning_rate = 0.001
+    momentum = 0.9
+    num_label = 4
+
+    contexts = [mx.context.gpu(0)]
+
+    def sym_gen(seq_len):
+        return lstm_unroll(num_lstm_layer, seq_len,
+                           num_hidden=num_hidden,
+                           num_label = num_label)
+
+    init_c = [('l%d_init_c'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_h = [('l%d_init_h'%l, (BATCH_SIZE, num_hidden)) for l in range(num_lstm_layer)]
+    init_states = init_c + init_h
+
+    data_train = DataIter(100000, BATCH_SIZE, num_label, init_states)
+    data_val = DataIter(1000, BATCH_SIZE, num_label, init_states)
+
+    symbol = sym_gen(SEQ_LENGTH)
+
+    model = mx.model.FeedForward(ctx=contexts,
+                                 symbol=symbol,
+                                 num_epoch=num_epoch,
+                                 learning_rate=learning_rate,
+                                 momentum=momentum,
+                                 wd=0.00001,
+                                 initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))
+
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.DEBUG, format=head)
+
+    print 'begin fit'
+
+    model.fit(X=data_train, eval_data=data_val,
+              eval_metric = mx.metric.np(Accuracy),
+              batch_end_callback=mx.callback.Speedometer(BATCH_SIZE, 50),)
+
+    model.save("ocr")
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 097e3eb603bd..28bc89406c0b 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -479,6 +479,14 @@ MXNET_DLL int MXFuncInvokeEx(FunctionHandle fun,
  */
 MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
                                                AtomicSymbolCreator **out_array);
+
+/*!
+ * \brief Get the name of an atomic symbol.
+ * \param creator the AtomicSymbolCreator.
+ * \param name The returned name of the creator.
+ */
+MXNET_DLL int MXSymbolGetAtomicSymbolName(AtomicSymbolCreator creator,
+                                          const char **name);
 /*!
  * \brief Get the detailed information about atomic symbol.
  * \param creator the AtomicSymbolCreator.
diff --git a/include/mxnet/mxrtc.h b/include/mxnet/mxrtc.h
index a45badb1d3dc..9de59f63da2a 100644
--- a/include/mxnet/mxrtc.h
+++ b/include/mxnet/mxrtc.h
@@ -60,7 +60,7 @@ class MXRtc {
                     unsigned int block_dim_Z);
 
  private:
-    static const std::string str_type;
+    static const char str_type[];
     static std::unordered_map kernel_registry;
 
     std::string name_;
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index 3649b36e0e05..e4f15082b398 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -65,11 +65,12 @@ class NDArray {
    * \return the data TBlob
    */
   inline TBlob data() const {
+    TBlob res;
     MSHADOW_TYPE_SWITCH(dtype_, DType, {
-      return TBlob(static_cast<DType*>(ptr_->shandle.dptr)
+      res = TBlob(static_cast<DType*>(ptr_->shandle.dptr)
         + offset_, shape_, ptr_->shandle.ctx.dev_mask());
     });
-    return TBlob();
+    return res;
   }
   /*!
    * \return the context of NDArray, this function is only valid when the NDArray is not empty
diff --git a/include/mxnet/operator_util.h b/include/mxnet/operator_util.h
index f96b85108b47..71276a4bec5f 100644
--- a/include/mxnet/operator_util.h
+++ b/include/mxnet/operator_util.h
@@ -11,6 +11,10 @@
 #ifndef MXNET_OPERATOR_UTIL_H_
 #define MXNET_OPERATOR_UTIL_H_
 
+#ifdef _MSC_VER
+#pragma warning(disable:4503)  // disable warning: decorated name length exceeded.
+#endif
+
 #include
 #include
 #include
@@ -56,6 +60,26 @@ struct EnvArguments {
   std::vector<Resource> resource;
 };
 
+/*!
+ * \brief source function that generates output based on env
+ *  The result container is pre-allocated with the correct shape.
+ * \param env The Environment arguments.
+ * \param ret The container to store return value.
+ * \param req The requirement to store the ret.
+ * \param ctx Runtime context to execute the function.
+ */
+typedef void (*SourceFunction)(const EnvArguments& env,
+                               TBlob* ret,
+                               OpReqType req,
+                               RunContext ctx);
+
+/*!
+ * \brief Shape inference function to get the correct shape.
+ * \param env The Environment arguments.
+ * \return The inferred result shape.
+ */
+typedef TShape (*SourceShapeFunction)(const EnvArguments& env);
+
 /*!
  * \brief Unary function that takes a src and save result to ret.
  * The result container is pre-allocated with the correct shape.
@@ -261,6 +285,11 @@ class SimpleOpRegEntry {
    * \param req the request.
    */
   virtual TSelf& set_resource_request(ResourceRequest req) = 0;
+  /*!
+   * \brief set shape inference function for source operators.
+   * \param fshapeinfer The shape function that performs the inference.
+   */
+  virtual TSelf& set_shape_function(SourceShapeFunction fshapeinfer) = 0;
   /*!
    * \brief set shape inference function.
    * Default: out_shape = in_shape
@@ -273,6 +302,16 @@
    * \param fshapeinfer The binary function that performs the operation.
    */
   virtual TSelf& set_shape_function(BinaryShapeFunction fshapeinfer) = 0;
+  /*!
+   * \brief set function of the function to be fsource
+   * \param dev_mask The device mask of the function can act on.
+   * \param fsource The source function that performs the operation.
+   * \param register_symbolic Whether to register a symbolic operator as well.
+   */
+  virtual TSelf& set_function(
+      int dev_mask,
+      SourceFunction fsource,
+      SimpleOpRegOption register_symbolic = kRegisterSymbolic) = 0;
   /*!
    * \brief set function of the function to be funary
    * \param dev_mask The device mask of the function can act on.
@@ -412,47 +451,9 @@ class SimpleOpRegistry {
 }
 
 /*!
-* \brief cast dynamic range variable into static variable
-* \param var the source value, constrained to be between 1 and 5
-* \param NDIM the const NDIM that can be used in the template
+* \brief Maximum ndim supported for special operators like broadcasting with non-contiguous lhs/rhs
 */
-#define MXNET_RANGE_SWITCH(var, NDIM, ...)  \
-  {                                         \
-    switch (var) {                          \
-      case 1:                               \
-        {                                   \
-          static const int NDIM = 1;        \
-          {__VA_ARGS__}                     \
-        }                                   \
-        break;                              \
-      case 2:                               \
-        {                                   \
-          static const int NDIM = 2;        \
-          {__VA_ARGS__}                     \
-        }                                   \
-        break;                              \
-      case 3:                               \
-        {                                   \
-          static const int NDIM = 3;        \
-          {__VA_ARGS__}                     \
-        }                                   \
-        break;                              \
-      case 4:                               \
-        {                                   \
-          static const int NDIM = 4;        \
-          {__VA_ARGS__}                     \
-        }                                   \
-        break;                              \
-      case 5:                               \
-        {                                   \
-          static const int NDIM = 5;        \
-          {__VA_ARGS__}                     \
-        }                                   \
-        break;                              \
-      default:                              \
-        LOG(FATAL) << "Only support ndim=1 to 5."; \
-    }                                       \
-  }
+#define MXNET_SPECIAL_MAX_NDIM 7
 
 //--------------------------------------------------------------
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index 31c380dd8503..da41cb07e52d 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -74,6 +74,10 @@ struct Resource {
    * \brief Get space requested as mshadow Tensor.
    *  The caller can request arbitrary size.
    *
+   * This space can be shared with other calls to this->get_space.
+   * So the caller needs to serialize the calls when using the conflicting space.
+   * The temp space will remain valid until release is called.
+   *
   * \param shape the Shape of returning tensor.
   * \param stream the stream of returning tensor.
   * \return the mshadow tensor requested.
   */
@@ -132,6 +136,16 @@ struct Resource {
         reinterpret_cast<DType*>(get_host_space_internal(shape.Size() * sizeof(DType))),
         shape, shape[ndim - 1], NULL);
   }
+  /*!
+   * \brief Release all the existing allocated space.
+   *  The existing allocated address will remain valid
+   *  until release is called.
+   *
+   *  Even if the user does not call release, the space occupied
+   *  by the resource will remain at most twice the maximum
+   *  requested space.
+   */
+  void release() const;
   /*!
    * \brief internal function to get space from resources.
    * \param size The size of the space.
diff --git a/make/config.mk b/make/config.mk
index bbd19e56b5d7..aa3986a21673 100644
--- a/make/config.mk
+++ b/make/config.mk
@@ -116,6 +116,9 @@ EXTRA_OPERATORS =
 # TORCH_PATH = $(HOME)/torch
 # MXNET_PLUGINS += plugin/torch/torch.mk
 
+# WARPCTC_PATH = $(HOME)/warp-ctc
+# MXNET_PLUGINS += plugin/warpctc/warpctc.mk
+
 # whether to use sframe integration. This requires build sframe
 # git@github.com:dato-code/SFrame.git
 # SFRAME_PATH = $(HOME)/SFrame
diff --git a/mshadow b/mshadow
index 310e015e5c65..44d61f8ef9d8 160000
--- a/mshadow
+++ b/mshadow
@@ -1 +1 @@
-Subproject commit 310e015e5c65bd5314e57fc79ceb06b162547325
+Subproject commit 44d61f8ef9d86e85e7bc62b2a1d4dc40554672f1
diff --git a/plugin/opencv/__init__.py b/plugin/opencv/__init__.py
new file mode 100644
index 000000000000..072575177e41
--- /dev/null
+++ b/plugin/opencv/__init__.py
@@ -0,0 +1,6 @@
+# coding: utf-8
+# pylint: disable=wildcard-import
+
+"""Opencv plugin for mxnet"""
+from .opencv import *
+
diff --git a/plugin/opencv/cv_api.cc b/plugin/opencv/cv_api.cc
new file mode 100644
index 000000000000..78bec01548d4
--- /dev/null
+++ b/plugin/opencv/cv_api.cc
@@ -0,0 +1,149 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file cv_api.cc
+ * \brief C API for opencv
+ * \author Junyuan Xie
+ */
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/engine.h>
+#include <opencv2/opencv.hpp>
+#include "cv_api.h"
+#include "../../src/c_api/c_api_error.h"
+
+
+using namespace mxnet;
+// http://www.64lines.com/jpeg-width-height
+// Gets the JPEG size from the array of data passed to the function, file reference: http://www.obrador.com/essentialjpeg/headerinfo.htm
+bool get_jpeg_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) {
+  // Check for valid JPEG image
+  mx_uint i = 0;  // Keeps track of the position within the file
+  if (data[i] == 0xFF && data[i+1] == 0xD8 && data[i+2] == 0xFF && data[i+3] == 0xE0) {
+    i += 4;
+    // Check for valid JPEG header (null terminated JFIF)
+    if (data[i+2] == 'J' && data[i+3] == 'F' && data[i+4] == 'I'
+        && data[i+5] == 'F' && data[i+6] == 0x00) {
+      // Retrieve the block length of the first block since
+      // the first block will not contain the size of file
+      uint16_t block_length = data[i] * 256 + data[i+1];
+      while (i < data_size) {
+        i += block_length;  // Increase the file index to get to the next block
+        if (i >= data_size) return false;  // Check to protect against segmentation faults
+        if (data[i] != 0xFF) return false;  // Check that we are truly at the start of another block
+        if (data[i+1] == 0xC0) {
+          // 0xFFC0 is the "Start of frame" marker which contains the file size
+          // The structure of the 0xFFC0 block is quite simple
+          // [0xFFC0][ushort length][uchar precision][ushort x][ushort y]
+          *height = data[i+5]*256 + data[i+6];
+          *width = data[i+7]*256 + data[i+8];
+          return true;
+        } else {
+          i += 2;  // Skip the block marker
+          block_length = data[i] * 256 + data[i+1];  // Go to the next block
+        }
+      }
+      return false;  // If this point is reached then no size was found
+    } else {
+      return false;  // Not a valid JFIF string
+    }
+  } else {
+    return false;  // Not a valid SOI header
+  }
+}
+
+bool get_png_size(const unsigned char* data, mx_uint data_size, mx_uint *width, mx_uint *height) {
+  if (data[0] == 0x89 && data[1] == 0x50 && data[2] == 0x4E && data[3] == 0x47) {
+    unsigned char const* p = data + 16;
+    *width = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3];
+    p += 4;
+    *height = ((p[0]*256 + p[1])*256 + p[2])*256 + p[3];
+    return true;
+  } else {
+    return false;
+  }
+}
+
+MXNET_DLL int MXCVImdecode(const unsigned char *img, const mx_uint len,
+                           const int flag, NDArrayHandle *out) {
+  API_BEGIN();
+  mx_uint dims[3];
+  CHECK_GE(flag, 0) << "flag must be 0 (grayscale) or 1 (colored).";
+  dims[2] = flag == 0 ? 1 : 3;
+  if (get_jpeg_size(img, len, dims+1, dims)) {
+  } else if (get_png_size(img, len, dims+1, dims)) {
+  } else {
+    LOG(FATAL) << "Only supports png and jpg.";
+  }
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+  unsigned char *img_cpy = new unsigned char[len];
+  memcpy(img_cpy, img, sizeof(unsigned char)*len);
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(1, len, CV_8U, img_cpy);
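+      // decode from the copied byte buffer straight into ndout's uint8 storage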
+      cv::Mat dst(dims[0], dims[1], flag == 0 ? CV_8U : CV_8UC3, ndout.data().dptr_);
+      cv::imdecode(buf, flag, &dst);
+      CHECK(!dst.empty());
+      delete[] img_cpy;
+    }, ndout.ctx(), {}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
+
+
+MXNET_DLL int MXCVResize(NDArrayHandle src, const mx_uint w, const mx_uint h,
+                         const int interpolation, NDArrayHandle *out) {
+  API_BEGIN();
+  NDArray ndsrc = *static_cast<NDArray*>(src);
+  CHECK_EQ(ndsrc.shape().ndim(), 3);
+  CHECK_EQ(ndsrc.ctx(), Context::CPU());
+  CHECK_EQ(ndsrc.dtype(), mshadow::kUint8);
+
+  mx_uint dims[3] = {h, w, ndsrc.shape()[2]};
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(ndsrc.shape()[0], ndsrc.shape()[1],
+                  dims[2] == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_);
+      cv::Mat dst(h, w, dims[2] == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_);
+      cv::resize(buf, dst, cv::Size(w, h), 0, 0, interpolation);
+      CHECK(!dst.empty());
+    }, ndout.ctx(), {ndsrc.var()}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
+
+MXNET_DLL int MXCVcopyMakeBorder(NDArrayHandle src,
+                                 const int top,
+                                 const int bot,
+                                 const int left,
+                                 const int right,
+                                 const int type,
+                                 const double value,
+                                 NDArrayHandle *out) {
+  API_BEGIN();
+  NDArray ndsrc = *static_cast<NDArray*>(src);
+  CHECK_EQ(ndsrc.shape().ndim(), 3);
+  CHECK_EQ(ndsrc.ctx(), Context::CPU());
+  CHECK_EQ(ndsrc.dtype(), mshadow::kUint8);
+
+  int h = ndsrc.shape()[0], w = ndsrc.shape()[1], c = ndsrc.shape()[2];
+  mx_uint dims[3] = {top+h+bot, left+w+right, c};
+  NDArray ndout(TShape(dims, dims+3), Context::CPU(), true, mshadow::kUint8);
+
+  Engine::Get()->PushSync([=](RunContext ctx){
+      ndout.CheckAndAlloc();
+      cv::Mat buf(h, w, c == 3 ? CV_8UC3 : CV_8U, ndsrc.data().dptr_);
+      cv::Mat dst(top+h+bot, left+w+right, c == 3 ? CV_8UC3 : CV_8U, ndout.data().dptr_);
+      cv::copyMakeBorder(buf, dst, top, bot, left, right, type, cv::Scalar(value));
+      CHECK(!dst.empty());
+    }, ndout.ctx(), {ndsrc.var()}, {ndout.var()});
+  NDArray *tmp = new NDArray();
+  *tmp = ndout;
+  *out = tmp;
+  API_END();
+}
diff --git a/plugin/opencv/cv_api.h b/plugin/opencv/cv_api.h
new file mode 100644
index 000000000000..fc224d0e1d05
--- /dev/null
+++ b/plugin/opencv/cv_api.h
@@ -0,0 +1,35 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file cv_api.h
+ * \brief C API for opencv
+ * \author Junyuan Xie
+ */
+#ifndef PLUGIN_OPENCV_CV_API_H_
+#define PLUGIN_OPENCV_CV_API_H_
+
+#include <mxnet/c_api.h>
+
+MXNET_DLL int MXCVImdecode(
+  const unsigned char *img,
+  const mx_uint len,
+  const int flag,
+  NDArrayHandle *out);
+
+MXNET_DLL int MXCVResize(
+  NDArrayHandle src,
+  const mx_uint w,
+  const mx_uint h,
+  const int interpolation,
+  NDArrayHandle *out);
+
+MXNET_DLL int MXCVcopyMakeBorder(
+  NDArrayHandle src,
+  const int top,
+  const int bot,
+  const int left,
+  const int right,
+  const int type,
+  const double value,
+  NDArrayHandle *out);
+
+#endif  // PLUGIN_OPENCV_CV_API_H_
diff --git a/plugin/opencv/opencv.mk b/plugin/opencv/opencv.mk
new file mode 100644
index 000000000000..ab1f6ff2ee03
--- /dev/null
+++ b/plugin/opencv/opencv.mk
@@ -0,0 +1,4 @@
+OPENCV_SRC = $(wildcard plugin/opencv/*.cc)
+PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(OPENCV_SRC))
+OPENCV_CUSRC = $(wildcard plugin/opencv/*.cu)
+PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(OPENCV_CUSRC))
diff --git a/plugin/opencv/opencv.py b/plugin/opencv/opencv.py
new file mode 100644
index 000000000000..6ee5be13f643
--- /dev/null
+++ b/plugin/opencv/opencv.py
@@ -0,0 +1,173 @@
+# coding: utf-8
+# pylint: disable=too-many-arguments,no-member,invalid-name
+
+"""Opencv plugin for mxnet"""
+import math
+import random
+import ctypes
+import cv2
+import mxnet as mx
+from mxnet.base import _LIB
+from mxnet.base import mx_uint, NDArrayHandle, check_call
+
+def imdecode(str_img, flag=1):
+    """Decode image from str buffer.
+    Wrapper for cv2.imdecode that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    str_img : str
+        str buffer read from image file
+    flag : int
+        same as flag for cv2.imdecode
+    Returns
+    -------
+    img : NDArray
+        decoded image in (height, width, channels)
+        with BGR color channel order
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVImdecode(ctypes.c_char_p(str_img),
+                                 mx_uint(len(str_img)),
+                                 flag, ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+def resize(src, size, interpolation=cv2.INTER_LINEAR):
+    """Resize image.
+    Wrapper for cv2.resize that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    src : NDArray
+        image in (height, width, channels)
+    size : tuple
+        target size in (width, height)
+    interpolation : int
+        same as interpolation for cv2.resize
+
+    Returns
+    -------
+    img : NDArray
+        resized image
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVResize(src.handle, mx_uint(size[0]), mx_uint(size[1]),
+                               interpolation, ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+def copyMakeBorder(src, top, bot, left, right, border_type=cv2.BORDER_CONSTANT, value=0):
+    """Pad image border
+    Wrapper for cv2.copyMakeBorder that uses mx.nd.NDArray
+
+    Parameters
+    ----------
+    src : NDArray
+        Image in (height, width, channels).
+        The other arguments are the same as for cv2.copyMakeBorder.
+
+    Returns
+    -------
+    img : NDArray
+        padded image
+    """
+    hdl = NDArrayHandle()
+    check_call(_LIB.MXCVcopyMakeBorder(src.handle, ctypes.c_int(top), ctypes.c_int(bot),
+                                       ctypes.c_int(left), ctypes.c_int(right),
+                                       ctypes.c_int(border_type), ctypes.c_double(value),
+                                       ctypes.byref(hdl)))
+    return mx.nd.NDArray(hdl)
+
+
+def scale_down(src_size, size):
+    """Scale down crop size if it's bigger than image size"""
+    w, h = size
+    sw, sh = src_size
+    if sh < h:
+        w, h = float(w*sh)/h, sh
+    if sw < w:
+        w, h = sw, float(h*sw)/w
+    return int(w), int(h)
+
+def fixed_crop(src, x0, y0, w, h, size=None, interpolation=cv2.INTER_CUBIC):
+    """Crop src at fixed location, and (optionally) resize it to size"""
+    out = mx.nd.crop(src, begin=(y0, x0, 0), end=(y0+h, x0+w, int(src.shape[2])))
+    if size is not None and (w, h) != size:
+        out = resize(out, size, interpolation=interpolation)
+    return out
+
+def random_crop(src, size):
+    """Randomly crop src with size. Upsample result if src is smaller than size"""
+    h, w, _ = src.shape
+    new_w, new_h = scale_down((w, h), size)
+
+    x0 = random.randint(0, w - new_w)
+    y0 = random.randint(0, h - new_h)
+
+    out = fixed_crop(src, x0, y0, new_w, new_h, size)
+    return out, (x0, y0, new_w, new_h)
+
+def color_normalize(src, mean, std):
+    """Normalize src with mean and std"""
+    src -= mean
+    src /= std
+    return src
+
+def random_size_crop(src, size, min_area=0.25, ratio=(3.0/4.0, 4.0/3.0)):
+    """Randomly crop src with size. Randomize area and aspect ratio"""
+    h, w, _ = src.shape
+    area = w*h
+    for _ in range(10):
+        new_area = random.uniform(min_area, 1.0) * area
+        new_ratio = random.uniform(*ratio)
+        # solve new_w*new_h == new_area with new_w/new_h == new_ratio
+        new_w = int(math.sqrt(new_area*new_ratio))
+        new_h = int(math.sqrt(new_area/new_ratio))
+
+        if random.uniform(0., 1.) < 0.5:
+            new_w, new_h = new_h, new_w
+
+        if new_w > w or new_h > h:
+            continue
+
+        x0 = random.randint(0, w - new_w)
+        y0 = random.randint(0, h - new_h)
+
+        out = fixed_crop(src, x0, y0, new_w, new_h, size)
+        return out, (x0, y0, new_w, new_h)
+
+    return random_crop(src, size)
+
+class ImageListIter(mx.io.DataIter):
+    """An example image iterator using opencv plugin"""
+    def __init__(self, root, flist, batch_size, size, mean=None):
+        super(ImageListIter, self).__init__()
+        self.root = root
+        self.list = [line.strip() for line in open(flist).readlines()]
+        self.cur = 0
+        self.batch_size = batch_size
+        self.size = size
+        if mean is not None:
+            self.mean = mx.nd.array(mean)
+        else:
+            self.mean = None
+
+    def reset(self):
+        self.cur = 0
+
+    def next(self):
+        batch = mx.nd.zeros((self.batch_size, self.size[1], self.size[0], 3))
+        i = self.cur
+        for i in range(self.cur, min(len(self.list), self.cur+self.batch_size)):
+            str_img = open(self.root+self.list[i]+'.jpg', 'rb').read()
+            img = imdecode(str_img, 1)
+            img, _ = random_crop(img, self.size)
+            batch[i - self.cur] = img
+        batch = mx.nd.transpose(batch, axes=(0, 3, 1, 2))
+        # i is the last index processed, so i - self.cur + 1 samples were filled
+        ret = mx.io.DataBatch(data=[batch],
+                              label=[],
+                              pad=self.batch_size-(i-self.cur+1),
+                              index=None)
+        self.cur = i + 1
+        return ret
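A quick, hypothetical usage sketch for the opencv plugin added above (the file name is made up; it assumes `plugin/opencv` is importable as `opencv`):

```python
import cv2
from opencv import imdecode, resize, copyMakeBorder

with open('example.jpg', 'rb') as f:        # made-up input file
    img = imdecode(f.read(), flag=1)        # uint8 NDArray of (height, width, 3)
img = resize(img, (224, 224), interpolation=cv2.INTER_LINEAR)
img = copyMakeBorder(img, 16, 16, 16, 16,   # pad 16 pixels on each side
                     border_type=cv2.BORDER_CONSTANT, value=0)
print(img.shape)                            # (256, 256, 3)
```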
diff --git a/plugin/warpctc/warpctc-inl.h b/plugin/warpctc/warpctc-inl.h
new file mode 100644
index 000000000000..b37132144cb9
--- /dev/null
+++ b/plugin/warpctc/warpctc-inl.h
@@ -0,0 +1,295 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc-inl.h
+ * \brief warpctc operator
+ * \author Liang Xiang
+*/
+#ifndef PLUGIN_WARPCTC_WARPCTC_INL_H_
+#define PLUGIN_WARPCTC_WARPCTC_INL_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "../../src/operator/operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace warpctc_enum {
+  enum CTCOpInputs {kData, kLabel};
+  enum CTCOpOutputs {kOut};
+}  // namespace warpctc_enum
+
+struct WarpCTCParam : public dmlc::Parameter<WarpCTCParam> {
+  int label_length;
+  int input_length;
+  DMLC_DECLARE_PARAMETER(WarpCTCParam) {
+    DMLC_DECLARE_FIELD(label_length)
+      .set_default(0)
+      .describe("Real label length");
+    DMLC_DECLARE_FIELD(input_length)
+      .set_default(0)
+      .describe("Input length");
+  }
+};
+
+template<typename xpu>
+class WarpCTCOp : public Operator {
+ private:
+  WarpCTCParam param_;
+
+ public:
+  explicit WarpCTCOp(WarpCTCParam p) {
+    this->param_ = p;
+  }
+
+  ~WarpCTCOp() {
+  }
+
+  inline void throw_on_error(ctcStatus_t status, const char* message) {
+    if (status != CTC_STATUS_SUCCESS) {
+      throw std::runtime_error(message
+                               + (", stat = "
+                                  + std::string(ctcGetStatusString(status))));
+    }
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2) << "CTCOutput Input: [data, label]";
+    CHECK_EQ(out_data.size(), 1) << "CTCOutput Output: [output]";
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    TBlob data = in_data[warpctc_enum::kData];
+    TBlob out = out_data[warpctc_enum::kOut];
+    Tensor<xpu, 2, real_t> data_tensor = data.FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, real_t> out_tensor = out.FlatTo2D<xpu, real_t>(s);
+    Softmax(out_tensor, data_tensor);
+  }
+
+  std::vector<int> labelLengths(const int * flat_labels, int minibatch,
+                                int size, int blank, int * total_length) {
+    CHECK_EQ(param_.label_length * minibatch, size)
+      << "label size should = label_length * minibatch";
+    std::vector<int> ret(minibatch, 0);
+    for (int i = 0; i < size; i++) {
+      if (flat_labels[i] == blank) {
+        continue;
+      }
+      int b = i / param_.label_length;
+      ret[b]++;
+      (*total_length)++;
+    }
+    return ret;
+  }
+
+  void removeBlank(const int * flat_labels, int * cpu_labels,
+                   int size, int blank) {
+    int k = 0;
+    for (int i = 0; i < size; i++) {
+      if (flat_labels[i] != blank) {
+        cpu_labels[k] = flat_labels[i];
+        k += 1;
+      }
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    TBlob data = in_data[warpctc_enum::kData];
+    TBlob label = in_data[warpctc_enum::kLabel];
+    CHECK_EQ(data.shape_.ndim(), 2) << "input data shape should be 2 (t*n, p)";
+    ctcComputeInfo info;
+    if (data.dev_mask_ == cpu::kDevMask) {
+      info.loc = CTC_CPU;
+      info.num_threads = 1;
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      info.loc = CTC_GPU;
+      info.stream = ctx.get_stream<gpu>()->stream_;
+#endif
+    } else {
+      LOG(FATAL) << "Unknown device type " << data.dev_mask_;
+    }
+
+    int T = param_.input_length;
+    int minibatch = data.shape_[0] / T;
+    int alphabet_size = data.shape_[1];
+    std::vector<int> input_lengths;
+    for (int i = 0; i < minibatch; i++) {
+      input_lengths.push_back(T);
+    }
+
+#if MXNET_USE_CUDA
+    cudaError_t cuda_status;
+#endif
+    float* activations = static_cast<float*>(data.dptr_);
+    int* flat_labels = static_cast<int*>(label.dptr_);
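+    // warp-ctc consumes labels in host memory, so copy them
+    // off the device first when running on the GPU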
+    int* cpu_raw_labels = flat_labels;
+    float* grads = static_cast<float*>(in_grad[warpctc_enum::kData].dptr_);
+    if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      cpu_raw_labels = reinterpret_cast<int*>(malloc(sizeof(int) * label.Size()));
+      cuda_status = cudaMemcpyAsync(cpu_raw_labels, flat_labels,
+                                    label.Size()*sizeof(int),
+                                    cudaMemcpyDeviceToHost,
+                                    ctx.get_stream<gpu>()->stream_);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda memcpy label error";
+#endif
+    }
+
+    int total_label_length = 0;
+    std::vector<int> label_lengths = labelLengths(cpu_raw_labels,
+                                                  minibatch,
+                                                  label.Size(),
+                                                  0, &total_label_length);
+    int* cpu_labels = reinterpret_cast<int*>(
+        malloc(sizeof(int) * total_label_length));
+    removeBlank(cpu_raw_labels, cpu_labels, label.Size(), 0);
+
+    size_t alloc_bytes;
+    throw_on_error(get_workspace_size(label_lengths.data(),
+                                      input_lengths.data(),
+                                      alphabet_size,
+                                      input_lengths.size(), info,
+                                      &alloc_bytes),
+                   "Error: get_workspace_size in inf_test");
+    void* ctc_workspace;
+
+    if (data.dev_mask_ == cpu::kDevMask) {
+      ctc_workspace = malloc(alloc_bytes);
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      cuda_status = cudaMalloc(&ctc_workspace, alloc_bytes);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda malloc workspace fail";
+#endif
+    }
+    std::vector<float> costs(minibatch);
+    throw_on_error(compute_ctc_loss(activations,
+                                    grads,
+                                    cpu_labels,
+                                    label_lengths.data(),
+                                    input_lengths.data(),
+                                    alphabet_size,
+                                    minibatch,
+                                    costs.data(),
+                                    ctc_workspace,
+                                    info),
+                   "Error: compute_ctc_loss");
+
+    if (data.dev_mask_ == cpu::kDevMask) {
+      free(ctc_workspace);
+      free(cpu_labels);
+    } else if (data.dev_mask_ == gpu::kDevMask) {
+#if MXNET_USE_CUDA
+      cuda_status = cudaFree(ctc_workspace);
+      CHECK_EQ(cuda_status, cudaSuccess) << "cuda free workspace fail";
+      free(cpu_raw_labels);
+      free(cpu_labels);
+#endif
+    }
+  }
+};
+
+template<typename xpu>
+Operator* CreateOp(WarpCTCParam type);
+
+
+#if DMLC_USE_CXX11
+class WarpCTCProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "label"};
+  }
+
+  virtual std::vector<std::string> ListOutputs() const {
+    return {"output"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs)
+      override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    TShape label_shape(dshape.ndim() - 1);
+    label_shape[0] = param_.label_length * (dshape[0] / param_.input_length);
+    SHAPE_ASSIGN_CHECK(*in_shape, warpctc_enum::kLabel, label_shape);
+
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  virtual bool InferType(std::vector<int> *in_type,
+                         std::vector<int> *out_type,
+                         std::vector<int> *aux_type) const {
+    CHECK_LE(in_type->size(), this->ListArguments().size());
+    in_type->clear();
+    in_type->push_back(mshadow::kFloat32);
+    in_type->push_back(mshadow::kInt32);
+    out_type->clear();
+    out_type->push_back(mshadow::kFloat32);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new WarpCTCProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "WarpCTC";
+  }
+
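+  // the backward pass reads the input data, the labels and the forward
+  // softmax output, so declare all three as dependencies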
+  std::vector<int> DeclareBackwardDependency(const std::vector<int> &out_grad,
+                                             const std::vector<int> &in_data,
+                                             const std::vector<int> &out_data)
+      const override {
+    return {in_data[warpctc_enum::kData],
+            in_data[warpctc_enum::kLabel],
+            out_data[warpctc_enum::kOut]};
+  }
+
+  Operator* CreateOperator(Context ctx) const override;
+
+ private:
+  WarpCTCParam param_;
+};
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // PLUGIN_WARPCTC_WARPCTC_INL_H_
diff --git a/plugin/warpctc/warpctc.cc b/plugin/warpctc/warpctc.cc
new file mode 100644
index 000000000000..db88a3316c7e
--- /dev/null
+++ b/plugin/warpctc/warpctc.cc
@@ -0,0 +1,29 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc.cc
+ * \brief warpctc op
+ * \author Liang Xiang
+*/
+
+#include "./warpctc-inl.h"
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(WarpCTCParam param) {
+  return new WarpCTCOp<cpu>(param);
+}
+
+Operator *WarpCTCProp::CreateOperator(Context ctx) const {
+  DO_BIND_DISPATCH(CreateOp, param_);
+}
+
+DMLC_REGISTER_PARAMETER(WarpCTCParam);
+
+MXNET_REGISTER_OP_PROPERTY(WarpCTC, WarpCTCProp)
+.describe("warp ctc.")
+.add_arguments(WarpCTCParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/warpctc/warpctc.cu b/plugin/warpctc/warpctc.cu
new file mode 100644
index 000000000000..186c4d0c18f4
--- /dev/null
+++ b/plugin/warpctc/warpctc.cu
@@ -0,0 +1,19 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file warpctc.cu
+ * \brief warpctc op
+ * \author Liang Xiang
+*/
+#include "./warpctc-inl.h"
+#include
+#include "../../src/operator/mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(WarpCTCParam param) {
+  return new WarpCTCOp<gpu>(param);
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/plugin/warpctc/warpctc.mk b/plugin/warpctc/warpctc.mk
new file mode 100644
index 000000000000..2223879ee3e3
--- /dev/null
+++ b/plugin/warpctc/warpctc.mk
@@ -0,0 +1,7 @@
+CFLAGS += -I$(WARPCTC_PATH)/include
+LDFLAGS += -L$(WARPCTC_PATH)/build -lwarpctc
+
+WARPCTC_SRC = $(wildcard plugin/warpctc/*.cc)
+PLUGIN_OBJ += $(patsubst %.cc, build/%.o, $(WARPCTC_SRC))
+WARPCTC_CUSRC = $(wildcard plugin/warpctc/*.cu)
+PLUGIN_CUOBJ += $(patsubst %.cu, build/%_gpu.o, $(WARPCTC_CUSRC))
diff --git a/ps-lite b/ps-lite
index 8aff164580f0..35ddccd4cd03 160000
--- a/ps-lite
+++ b/ps-lite
@@ -1 +1 @@
-Subproject commit 8aff164580f0e4ff81ad98038b6ec4ec02452ce8
+Subproject commit 35ddccd4cd0302f78ed2a05f1258860d4666e43c
diff --git a/python/mxnet/_ndarray_internal.py b/python/mxnet/_ndarray_internal.py
new file mode 100644
index 000000000000..cbe2bcd96220
--- /dev/null
+++ b/python/mxnet/_ndarray_internal.py
@@ -0,0 +1 @@
+"""NDArray namespace used to register internal functions"""
diff --git a/python/mxnet/_symbol_internal.py b/python/mxnet/_symbol_internal.py
new file mode 100644
index 000000000000..d798f8d3704a
--- /dev/null
+++ b/python/mxnet/_symbol_internal.py
@@ -0,0 +1 @@
+"""Symbol namespace used to register internal functions"""
diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py
index 8265eb39b52c..618d8cd4d783 100644
--- a/python/mxnet/callback.py
+++ b/python/mxnet/callback.py
@@ -8,22 +8,26 @@
 import time
 from .model import save_checkpoint
 
-def do_checkpoint(prefix):
+def do_checkpoint(prefix, period=1):
     """Callback to checkpoint the model to prefix every epoch.
 
     Parameters
     ----------
     prefix : str
         The file prefix to checkpoint to
+    period : int
+        How many epochs to wait before checkpointing. Default is 1.
 
     Returns
     -------
     callback : function
        The callback function that can be passed as iter_end_callback to fit.
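+
+    Example
+    -------
+    >>> # a hypothetical call: keep a checkpoint of 'mymodel' every 5 epochs
+    >>> model.fit(X=train_data, iter_end_callback=mx.callback.do_checkpoint('mymodel', 5))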
""" + period = int(max(1, period)) def _callback(iter_no, sym, arg, aux): """The checkpoint function.""" - save_checkpoint(prefix, iter_no + 1, sym, arg, aux) + if (iter_no + 1) % period == 0: + save_checkpoint(prefix, iter_no + 1, sym, arg, aux) return _callback diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 13a5c3099864..32c6ec1748a4 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -98,19 +98,26 @@ def forward(self, is_train=False, **kwargs): >>> # doing forward by not specifying things, but copy to the executor before hand >>> mydata.copyto(texec.arg_dict['data']) >>> texec.forward(is_train=True) + >>> # doing forward by specifying data and get outputs + >>> outputs = texec.forward(is_train=True, data=mydata) + >>> print(outputs[0].asnumpy()) """ if len(kwargs) != 0: arg_dict = self.arg_dict for name, array in kwargs.items(): - if not isinstance(array, NDArray): - raise ValueError('only accept keyword argument of NDArrays') + if not isinstance(array, (NDArray, np.ndarray)): + raise ValueError('only accept keyword argument of NDArrays and numpy.ndarray') if name not in arg_dict: raise TypeError('Unknown argument %s' % name) - array.copyto(arg_dict[name]) + if arg_dict[name].shape != array.shape: + raise ValueError('Shape not match! Argument %s, need: %s, received: %s' + %(name, str(arg_dict[name].shape), str(array.shape))) + arg_dict[name][:] = array check_call(_LIB.MXExecutorForward( self.handle, ctypes.c_int(int(is_train)))) + return self.outputs def backward(self, out_grads=None): """Do backward pass to get the gradient of arguments. diff --git a/python/mxnet/executor_manager.py b/python/mxnet/executor_manager.py index 5a8f18e6612c..cc41691d342b 100644 --- a/python/mxnet/executor_manager.py +++ b/python/mxnet/executor_manager.py @@ -442,6 +442,7 @@ def load_data_batch(self, data_batch): self.curr_execgrp = execgrp else: self.curr_execgrp = self.execgrp + self.curr_execgrp.load_data_batch(data_batch) def forward(self, is_train=False): diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index 3d59443419e6..47aa0bd3a7b9 100644 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -1,4 +1,5 @@ # coding: utf-8 +# pylint: disable=too-many-branches """Initialization helper for mxnet""" from __future__ import absolute_import @@ -29,6 +30,10 @@ def __call__(self, name, arr): raise TypeError('arr must be NDArray') if name.startswith('upsampling'): self._init_bilinear(name, arr) + elif name.startswith('stn_loc') and name.endswith('weight'): + self._init_zero(name, arr) + elif name.startswith('stn_loc') and name.endswith('bias'): + self._init_loc_bias(name, arr) elif name.endswith('bias'): self._init_bias(name, arr) elif name.endswith('gamma'): @@ -59,6 +64,11 @@ def _init_bilinear(self, _, arr): weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) arr[:] = weight.reshape(shape) + def _init_loc_bias(self, _, arr): + shape = arr.shape + assert(shape[0] == 6) + arr[:] = np.array([1.0, 0, 0, 0, 1.0, 0]) + def _init_zero(self, _, arr): arr[:] = 0.0 @@ -82,6 +92,7 @@ def _init_default(self, name, _): raise ValueError('Unknown initialization pattern for %s' % name) # pylint: enable=no-self-use, missing-docstring, invalid-name + class Load(object): """Initialize by loading pretrained param from file or dict @@ -124,6 +135,7 @@ def __call__(self, name, arr): if self.verbose: logging.info('Initialized %s by default', name) + class Mixed(object): """Initialize with mixed Initializer @@ -176,6 +188,7 @@ def __init__(self, 
sigma=0.01): def _init_weight(self, _, arr): random.normal(0, self.sigma, out=arr) + class Orthogonal(Initializer): """Intialize weight as Orthogonal matrix @@ -255,3 +268,19 @@ def _init_weight(self, _, arr): random.normal(0, scale, out=arr) else: raise ValueError("Unknown random type") + +class MSRAPrelu(Xavier): + """Initialize the weight with initialization scheme from + Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification. + + Parameters + ---------- + factor_type: str, optional + Use ```avg```, ```in```, or ```out``` to init + + slope: float, optional + initial slope of any PReLU (or similar) nonlinearities. + """ + def __init__(self, factor_type="avg", slope=0.25): + magnitude = 2. / (1 + slope ** 2) + super(MSRAPrelu, self).__init__("gaussian", factor_type, magnitude) diff --git a/python/mxnet/io.py b/python/mxnet/io.py index 382db89a448a..532c6d12ebf2 100644 --- a/python/mxnet/io.py +++ b/python/mxnet/io.py @@ -16,6 +16,7 @@ from .base import check_call, ctypes2docstring from .ndarray import NDArray from .ndarray import array +from .ndarray import concatenate class DataBatch(object): @@ -307,11 +308,11 @@ def _init_data(data, allow_empty, default_name): raise TypeError("Input must be NDArray, numpy.ndarray, " + \ "a list of them or dict with them as values") for k, v in data.items(): - if isinstance(v, NDArray): - data[k] = v.asnumpy() - for k, v in data.items(): - if not isinstance(v, np.ndarray): - raise TypeError(("Invalid type '%s' for %s, " % (type(v), k)) + \ + if not isinstance(v, NDArray): + try: + data[k] = array(v) + except: + raise TypeError(("Invalid type '%s' for %s, " % (type(v), k)) + \ "should be NDArray or numpy.ndarray") return list(data.items()) @@ -348,8 +349,8 @@ def __init__(self, data, label=None, batch_size=1, shuffle=False, last_batch_han if shuffle: idx = np.arange(self.data[0][1].shape[0]) np.random.shuffle(idx) - self.data = [(k, v[idx]) for k, v in self.data] - self.label = [(k, v[idx]) for k, v in self.label] + self.data = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.data] + self.label = [(k, array(v.asnumpy()[idx], v.context)) for k, v in self.label] self.data_list = [x[1] for x in self.data] + [x[1] for x in self.label] self.num_source = len(self.data_list) @@ -411,11 +412,10 @@ def _getdata(self, data_source): """Load data from underlying arrays, internal use only""" assert(self.cursor < self.num_data), "DataIter needs reset." 
        if self.cursor + self.batch_size <= self.num_data:
-            return [array(x[1][self.cursor:self.cursor+self.batch_size]) for x in data_source]
+            return [x[1][self.cursor:self.cursor+self.batch_size] for x in data_source]
        else:
            pad = self.batch_size - self.num_data + self.cursor
-            return [array(np.concatenate((x[1][self.cursor:], x[1][:pad]),
-                                         axis=0)) for x in data_source]
+            return [concatenate([x[1][self.cursor:], x[1][:pad]]) for x in data_source]

    def getdata(self):
        return self._getdata(self.data)
diff --git a/python/mxnet/libinfo.py b/python/mxnet/libinfo.py
index 6cf82d8ae3ff..54b49c4290b7 100644
--- a/python/mxnet/libinfo.py
+++ b/python/mxnet/libinfo.py
@@ -14,7 +14,8 @@ def find_lib_path():
    """
    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
    api_path = os.path.join(curr_path, '../../lib/')
-    dll_path = [curr_path, api_path]
+    cmake_build_path = os.path.join(curr_path, '../../build/Release/')
+    dll_path = [curr_path, api_path, cmake_build_path]
    if os.name == 'nt':
        vs_configuration = 'Release'
        if platform.architecture()[0] == '64bit':
diff --git a/python/mxnet/model.py b/python/mxnet/model.py
index 6d8204e6f23d..614f01813505 100644
--- a/python/mxnet/model.py
+++ b/python/mxnet/model.py
@@ -232,6 +232,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
            do_reset = True
            for data_batch in train_data:
                executor_manager.load_data_batch(data_batch)
+
                if monitor is not None:
                    monitor.tic()
@@ -273,7 +274,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                    do_reset = False
                    break

-            if do_reset is True:
+            if do_reset:
                logger.info('Epoch[%d] Resetting Data Iterator', epoch)
                train_data.reset()

@@ -315,6 +316,7 @@ def _train_multi_device(symbol, ctx, arg_names, param_names, aux_names,
                name_value = eval_metric.get_name_value()
                for name, value in name_value:
                    logger.info('Epoch[%d] Validation-%s=%f', epoch, name, value)
+                eval_data.reset()
    # end of all epochs
    return
diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py
index 5549965ca702..d73070284fe4 100644
--- a/python/mxnet/module/base_module.py
+++ b/python/mxnet/module/base_module.py
@@ -1,4 +1,4 @@
-# pylint: disable=too-many-arguments, too-many-locals, too-many-public-methods
+# pylint: disable=too-many-arguments, too-many-locals, too-many-public-methods, too-many-branches
 """`BaseModule` defines an API for modules."""

 import logging
@@ -276,7 +276,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
            eval_batch_end_callback=None, initializer=Uniform(0.01),
            arg_params=None, aux_params=None, allow_missing=False,
            force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
-            validation_metric=None):
+            validation_metric=None, monitor=None):
        """Train the module parameters.

        Parameters
@@ -327,11 +327,12 @@ def fit(self, train_data, eval_data=None, eval_metric='acc',
        num_epoch : int
            Number of epochs to run training.
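+        monitor : Monitor, optional
+            Editor's addition (not in the original patch text), documenting the new
+            ``monitor`` argument above: if given, it is installed on all executors
+            before training starts (see ``install_monitor``).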
""" - assert num_epoch is not None, 'please specify number of epochs' self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, for_training=True, force_rebind=force_rebind) + if monitor is not None: + self.install_monitor(monitor) self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, allow_missing=allow_missing, force_init=force_init) self.init_optimizer(kvstore=kvstore, optimizer=optimizer, @@ -349,10 +350,15 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', tic = time.time() eval_metric.reset() for nbatch, data_batch in enumerate(train_data): + if monitor is not None: + monitor.tic() self.forward_backward(data_batch) self.update() self.update_metric(eval_metric, data_batch.label) + if monitor is not None: + monitor.toc_print() + if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric, @@ -452,7 +458,7 @@ def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=Non """ raise NotImplementedError() - def set_params(self, arg_params, aux_params): + def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True): """Assign parameter and aux state values. Parameters @@ -461,9 +467,15 @@ def set_params(self, arg_params, aux_params): Dictionary of name to value (`NDArray`) mapping. aux_params : dict Dictionary of name to value (`NDArray`) mapping. + allow_missing : bool + If true, params could contain missing values, and the initializer will be + called to fill those missing params. + force_init : bool + If true, will force re-initialize even if already initialized. + """ self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params, - allow_missing=False, force_init=True) + allow_missing=allow_missing, force_init=force_init) def save_params(self, fname): """Save model parameters to file. @@ -499,6 +511,10 @@ def load_params(self, fname): raise ValueError("Invalid param file " + fname) self.set_params(arg_params, aux_params) + def install_monitor(self, mon): + """Install monitor on all executors""" + raise NotImplementedError() + ################################################################################ # Computations ################################################################################ diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py index 7ab039ea45d4..94f47948415e 100644 --- a/python/mxnet/module/bucketing_module.py +++ b/python/mxnet/module/bucketing_module.py @@ -199,7 +199,7 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None): Typically `data_batch.provide_label`. 
""" assert self.binded, 'call bind before switching bucket' - if not self._buckets.has_key(bucket_key): + if not bucket_key in self._buckets: symbol, data_names, label_names = self._sym_gen(bucket_key) module = Module(symbol, data_names, label_names, logger=self.logger, context=self._context, @@ -236,7 +236,7 @@ def init_optimizer(self, kvstore='local', optimizer='sgd', self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params, force_init=force_init) - for mod in self._buckets.itervalues(): + for mod in self._buckets.values(): if mod is not self._curr_module: mod.borrow_optimizer(self._curr_module) @@ -325,3 +325,9 @@ def symbol(self): """The symbol of the current bucket being used.""" assert self.binded return self._curr_module.symbol + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + for mod in self._buckets.values(): + mod.install_monitor(mon) diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py index f51d94df4799..62e815e487ec 100644 --- a/python/mxnet/module/executor_group.py +++ b/python/mxnet/module/executor_group.py @@ -57,10 +57,13 @@ class DataParallelExecutorGroup(object): of the data/label inputs. logger : Logger Default is `logging`. + fixed_param_names: list of str + Indicate parameters to be fixed during training. Parameters in this list will not allocate + space for gradient, nor do gradient calculation. """ def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_names, for_training, inputs_need_grad, shared_group=None, input_types=None, - logger=logging): + logger=logging, fixed_param_names=None): self.param_names = param_names self.arg_names = symbol.list_arguments() self.aux_names = symbol.list_auxiliary_states() @@ -75,6 +78,10 @@ def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_ self.input_types = input_types self.logger = logger + self.fixed_param_names = fixed_param_names + if self.fixed_param_names is None: + self.fixed_param_names = [] + if shared_group is not None: self.shared_data_arrays = shared_group.shared_data_arrays else: @@ -335,7 +342,7 @@ def _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group): grad_req = {} for name in self.arg_names: if self.for_training: - if name in self.param_names: + if name in self.param_names and name not in self.fixed_param_names: grad_req[name] = 'write' elif name in data_names: grad_req[name] = 'write' if self.inputs_need_grad else 'null' @@ -424,3 +431,8 @@ def _sliced_shape(self, shapes, i): """ return [(k, tuple([self.slices[i].stop-self.slices[i].start] + list(v[1:]))) for k, v in shapes] + + def install_monitor(self, mon): + """Install monitor on all executors""" + for exe in self.execs: + mon.install(exe) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 3156ac98948c..36f92f084881 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -33,9 +33,11 @@ class Module(BaseModule): Default is `cpu()`. work_load_list : list of number Default `None`, indicating uniform workload. + fixed_param_names: list of str + Default `None`, indicating no network parameters are fixed. 
""" def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), - logger=logging, context=ctx.cpu(), work_load_list=None): + logger=logging, context=ctx.cpu(), work_load_list=None, fixed_param_names=None): super(Module, self).__init__(logger=logger) if isinstance(context, ctx.Context): @@ -54,6 +56,7 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), arg_names = symbol.list_arguments() input_names = data_names + label_names self._param_names = [x for x in arg_names if x not in input_names] + self._fixed_param_names = fixed_param_names self._aux_names = symbol.list_auxiliary_states() self._data_names = data_names self._label_names = label_names @@ -169,15 +172,17 @@ def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=Non def _impl(name, arr, cache): """Internal helper for parameter initialization""" if cache is not None: - if cache.has_key(name): + if name in cache: cache_arr = cache[name] # just in case the cached array is just the target itself if cache_arr is not arr: cache_arr.copyto(arr) else: - assert allow_missing - initializer(name, arr) + if not allow_missing: + raise RuntimeError("%s is not presented" % name) + if initializer != None: + initializer(name, arr) else: initializer(name, arr) @@ -253,14 +258,13 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, self._work_load_list, data_shapes, label_shapes, self._param_names, for_training, inputs_need_grad, - shared_group, logger=self.logger) - + shared_group, logger=self.logger, + fixed_param_names=self._fixed_param_names) if shared_module is not None: self.params_initialized = True self._arg_params = shared_module._arg_params self._aux_params = shared_module._aux_params - - if self.params_initialized: + elif self.params_initialized: # if the parameters are already initialized, we are re-binding # so automatically copy the already initialized params self._exec_group.set_params(self._arg_params, self._aux_params) @@ -449,3 +453,8 @@ def _sync_params_from_devices(self): latest parameters from `self._arg_params` and `self._aux_params`. 
""" self._exec_group.get_params(self._arg_params, self._aux_params) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + self._exec_group.install_monitor(mon) diff --git a/python/mxnet/module/python_module.py b/python/mxnet/module/python_module.py index 09866d8948e4..ab9b952010a7 100644 --- a/python/mxnet/module/python_module.py +++ b/python/mxnet/module/python_module.py @@ -326,3 +326,7 @@ def get_input_grads(self, merge_multi_context=True): """ assert merge_multi_context == True return [self._scores_grad] + + def install_monitor(self, mon): + """Install monitor on all executors""" + raise NotImplementedError() diff --git a/python/mxnet/module/sequential_module.py b/python/mxnet/module/sequential_module.py index 75c499dfafc7..3e9ac3d49855 100644 --- a/python/mxnet/module/sequential_module.py +++ b/python/mxnet/module/sequential_module.py @@ -383,3 +383,9 @@ def update_metric(self, eval_metric, labels): if meta.has_key(SequentialModule.META_TAKE_LABELS) and \ meta[SequentialModule.META_TAKE_LABELS]: module.update_metric(eval_metric, labels) + + def install_monitor(self, mon): + """ Install monitor on all executors """ + assert self.binded + for module in self._modules: + module.install_monitor(mon) diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index 12ad735ffc72..3d61f95b6b21 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -1,7 +1,8 @@ # coding: utf-8 -# pylint: disable= too-many-lines, redefined-builtin +# pylint: disable= too-many-lines, redefined-builtin, protected-access """NDArray API of mxnet.""" from __future__ import absolute_import +from __future__ import division import ctypes import warnings @@ -15,7 +16,9 @@ from .base import ctypes2buffer from .base import check_call, ctypes2docstring from .context import Context +from . import _ndarray_internal as _internal +# pylint: disable= no-member _DTYPE_NP_TO_MX = { np.float32 : 0, np.float64 : 1, @@ -31,6 +34,7 @@ 3 : np.uint8, 4 : np.int32 } +# pylint: enable= no-member def _new_empty_handle(): """Return a new empty handle. 
@@ -100,9 +104,9 @@ def __iadd__(self, other): if not self.writable: raise ValueError('trying to add to a readonly NDArray') if isinstance(other, NDArray): - return NDArray._plus(self, other, out=self) + return _internal._plus(self, other, out=self) elif isinstance(other, numeric_types): - return NDArray._plus_scalar(self, float(other), out=self) + return _internal._plus_scalar(self, float(other), out=self) else: raise TypeError('type %s not supported' % str(type(other))) @@ -116,9 +120,9 @@ def __isub__(self, other): if not self.writable: raise ValueError('trying to subtract from a readonly NDArray') if isinstance(other, NDArray): - return NDArray._minus(self, other, out=self) + return _internal._minus(self, other, out=self) elif isinstance(other, numeric_types): - return NDArray._minus_scalar(self, float(other), out=self) + return _internal._minus_scalar(self, float(other), out=self) else: raise TypeError('type %s not supported' % str(type(other))) @@ -129,15 +133,15 @@ def __mul__(self, other): return multiply(self, other) def __neg__(self): - return NDArray._mul_scalar(self, -1.0) + return _internal._mul_scalar(self, -1.0) def __imul__(self, other): if not self.writable: raise ValueError('trying to multiply to a readonly NDArray') if isinstance(other, NDArray): - return NDArray._mul(self, other, out=self) + return _internal._mul(self, other, out=self) elif isinstance(other, numeric_types): - return NDArray._mul_scalar(self, float(other), out=self) + return _internal._mul_scalar(self, float(other), out=self) else: raise TypeError('type %s not supported' % str(type(other))) @@ -154,14 +158,26 @@ def __idiv__(self, other): if not self.writable: raise ValueError('trying to divide from a readonly NDArray') if isinstance(other, NDArray): - return NDArray._div(self, other, out=self) + return _internal._div(self, other, out=self) elif isinstance(other, numeric_types): - return NDArray._div_scalar(self, float(other), out=self) + return _internal._div_scalar(self, float(other), out=self) else: raise TypeError('type %s not supported' % str(type(other))) def __truediv__(self, other): - return self.__div__(other) + return divide(self, other) + + def __rtruediv__(self, other): + return divide(other, self) + + def __itruediv__(self, other): + return self.__idiv__(other) + + def __pow__(self, other): + return power(self, other) + + def __rpow__(self, other): + return power(other, self) def __getstate__(self): this = self.__dict__.copy() @@ -204,7 +220,7 @@ def __setitem__(self, in_slice, value): if value.handle is not self.handle: value.copyto(self) elif isinstance(value, numeric_types): - NDArray._set_value(float(value), out=self) + _internal._set_value(float(value), out=self) elif isinstance(value, (np.ndarray, np.generic)): self._sync_copyfrom(value) else: @@ -300,20 +316,19 @@ def broadcast_to(self, shape): the broadcast shape """ cur_shape = self.shape - err_str = 'operands could not be broadcast together with remapped shapes'\ - '[original->remapped]: {} and requested shape {}'.format(cur_shape, shape) + err_str = 'operands could not be broadcast together with remapped shapes' \ + '[original->remapped]: {} and requested shape {}'.format(cur_shape, shape) if len(shape) < len(cur_shape): raise ValueError(err_str) cur_shape = (1,) * (len(shape) - len(cur_shape)) + cur_shape - cur_shape = np.array(cur_shape) - shape = np.array(shape) - broadcasting_axes = np.nonzero(cur_shape != shape) - if (cur_shape[broadcasting_axes] != 1).any(): + cur_shape_arr = np.array(cur_shape) + broadcasting_axes = 
np.nonzero(cur_shape_arr != np.array(shape))
+        if (cur_shape_arr[broadcasting_axes] != 1).any():
            raise ValueError(err_str)
-        ret = self.reshape(tuple(cur_shape))
-        for axis in broadcasting_axes[0]:
-            ret = broadcast_axis(ret, axis=axis, size=shape[axis])
-        return ret
+        if cur_shape != self.shape:
+            return broadcast_to(self.reshape(cur_shape), shape=shape)
+        else:
+            return broadcast_to(self, shape=tuple(shape))
    # pylint: enable= undefined-variable

    def wait_to_read(self):
@@ -454,10 +469,10 @@ def copyto(self, other):
                warnings.warn('copy an array to itself, is it intended?',
                              RuntimeWarning)
                return
-            return NDArray._copyto(self, out=other)
+            return _internal._copyto(self, out=other)
        elif isinstance(other, Context):
            hret = NDArray(_new_alloc_handle(self.shape, other, True, self.dtype))
-            return NDArray._copyto(self, out=hret)
+            return _internal._copyto(self, out=hret)
        else:
            raise TypeError('copyto do not support type ' + str(type(other)))

@@ -509,7 +524,7 @@ def onehot_encode(indices, out):
        Same as out.
    """
    # pylint: disable= no-member, protected-access
-    return NDArray._onehot_encode(indices, out, out=out)
+    return _internal._onehot_encode(indices, out, out=out)
    # pylint: enable= no-member, protected-access

@@ -535,40 +550,85 @@ def empty(shape, ctx=None, dtype=mx_real_t):
        ctx = Context.default_ctx
    return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype))

-def add(lhs, rhs):
-    """ Perform element-wise addition
+#pylint: disable= too-many-arguments, no-member, protected-access
+def _ufunc_helper(lhs, rhs, fn_array, fn_scalar, lfn_scalar, rfn_scalar=None):
+    """ Helper function for element-wise operation
+    The function will perform numpy-like broadcasting if needed and dispatch to the appropriate function

    Parameters
    ----------
-    lhs : Array or float value
-        left hand side operand
+    lhs : NDArray or numeric value
+        left hand side operand

-    rhs : Array of float value
+    rhs : NDArray or numeric value
        right hand side operand

+    fn_array : function
+        function to be called if both lhs and rhs are of NDArray type
+
+    fn_scalar : function
+        function to be called if both lhs and rhs are numeric values
+
+    lfn_scalar : function
+        function to be called if lhs is NDArray while rhs is numeric value
+
+    rfn_scalar : function
+        function to be called if lhs is numeric value while rhs is NDArray;
+        if none is provided, then the function is commutative, so rfn_scalar is equal to lfn_scalar
+
    Returns
    -------
-    out: Array
+    out: NDArray
        result array
    """
-    # pylint: disable= no-member, protected-access
    if isinstance(lhs, numeric_types):
        if isinstance(rhs, numeric_types):
-            return lhs + rhs
+            return fn_scalar(lhs, rhs)
        else:
-            return add(rhs, lhs)
+            if rfn_scalar is None:
+                # commutative function
+                return lfn_scalar(rhs, float(lhs))
+            else:
+                return rfn_scalar(rhs, float(lhs))
    elif isinstance(rhs, numeric_types):
-        return NDArray._plus_scalar(lhs, float(rhs))
+        return lfn_scalar(lhs, float(rhs))
    elif isinstance(rhs, NDArray):
+        # check whether broadcasting is needed
        lsize = functools.reduce(operator.mul, lhs.shape)
        rsize = functools.reduce(operator.mul, rhs.shape)
        if lsize < rsize:
            lhs = lhs.broadcast_to(rhs.shape)
        elif lsize > rsize:
            rhs = rhs.broadcast_to(lhs.shape)
-        return NDArray._plus(lhs, rhs)
+        return fn_array(lhs, rhs)
    else:
        raise TypeError('type %s not supported' % str(type(rhs)))
+#pylint: enable= too-many-arguments, no-member, protected-access
+
+def add(lhs, rhs):
+    """ Perform element-wise addition
+
+    Parameters
+    ----------
+    lhs : Array or float value
+        left hand side operand
+
+    rhs : Array or float
value + right hand side operand + + Returns + ------- + out: Array + result array + """ + # pylint: disable= no-member, protected-access + return _ufunc_helper( + lhs, + rhs, + _internal._plus, + operator.add, + _internal._plus_scalar, + None) # pylint: enable= no-member, protected-access def subtract(lhs, rhs): @@ -588,25 +648,13 @@ def subtract(lhs, rhs): result array """ # pylint: disable= no-member, protected-access - if isinstance(lhs, numeric_types): - if isinstance(rhs, numeric_types): - return lhs - rhs - elif isinstance(rhs, NDArray): - return NDArray._rminus_scalar(rhs, float(lhs)) - else: - raise TypeError('type %s not supported' % str(type(rhs))) - elif isinstance(rhs, numeric_types): - return NDArray._minus_scalar(lhs, float(rhs)) - elif isinstance(rhs, NDArray): - lsize = functools.reduce(operator.mul, lhs.shape) - rsize = functools.reduce(operator.mul, rhs.shape) - if lsize < rsize: - lhs = lhs.broadcast_to(rhs.shape) - elif lsize > rsize: - rhs = rhs.broadcast_to(lhs.shape) - return NDArray._minus(lhs, rhs) - else: - raise TypeError('type %s not supported' % str(type(rhs))) + return _ufunc_helper( + lhs, + rhs, + _internal._minus, + operator.sub, + _internal._minus_scalar, + _internal._rminus_scalar) # pylint: enable= no-member, protected-access def multiply(lhs, rhs): @@ -626,23 +674,13 @@ def multiply(lhs, rhs): result array """ # pylint: disable= no-member, protected-access - if isinstance(lhs, numeric_types): - if isinstance(rhs, numeric_types): - return lhs * rhs - else: - return multiply(rhs, lhs) - elif isinstance(rhs, numeric_types): - return NDArray._mul_scalar(lhs, float(rhs)) - elif isinstance(rhs, NDArray): - lsize = functools.reduce(operator.mul, lhs.shape) - rsize = functools.reduce(operator.mul, rhs.shape) - if lsize < rsize: - lhs = lhs.broadcast_to(rhs.shape) - elif lsize > rsize: - rhs = rhs.broadcast_to(lhs.shape) - return NDArray._mul(lhs, rhs) - else: - raise TypeError('type %s not supported' % str(type(rhs))) + return _ufunc_helper( + lhs, + rhs, + _internal._mul, + operator.mul, + _internal._mul_scalar, + None) # pylint: enable= no-member, protected-access def divide(lhs, rhs): @@ -662,25 +700,91 @@ def divide(lhs, rhs): result array """ # pylint: disable= no-member, protected-access - if isinstance(lhs, numeric_types): - if isinstance(rhs, numeric_types): - return lhs / rhs - elif isinstance(rhs, NDArray): - return NDArray._rdiv_scalar(rhs, float(lhs)) - else: - raise TypeError('type %s not supported' % str(type(rhs))) - elif isinstance(rhs, numeric_types): - return NDArray._div_scalar(lhs, float(rhs)) - elif isinstance(rhs, NDArray): - lsize = functools.reduce(operator.mul, lhs.shape) - rsize = functools.reduce(operator.mul, rhs.shape) - if lsize < rsize: - lhs = lhs.broadcast_to(rhs.shape) - elif lsize > rsize: - rhs = rhs.broadcast_to(lhs.shape) - return NDArray._div(lhs, rhs) - else: - raise TypeError('type %s not supported' % str(type(rhs))) + return _ufunc_helper( + lhs, + rhs, + _internal._div, + operator.truediv, + _internal._div_scalar, + _internal._rdiv_scalar) + # pylint: enable= no-member, protected-access + +def power(lhs, rhs): + """ Perform power operator + + Parameters + ---------- + lhs : Array or float value + left hand side operand + + rhs : Array of float value + right hand side operand + + Returns + ------- + out: Array + result array + """ + # pylint: disable= no-member, protected-access + return _ufunc_helper( + lhs, + rhs, + _internal._power, + operator.pow, + _internal._power_scalar, + _internal._rpower_scalar) + # pylint: 
enable= no-member, protected-access + +def maximum(lhs, rhs): + """ Perform maximum operator + + Parameters + ---------- + lhs : Array or float value + left hand side operand + + rhs : Array of float value + right hand side operand + + Returns + ------- + out: Array + result array + """ + # pylint: disable= no-member, protected-access + return _ufunc_helper( + lhs, + rhs, + _internal._maximum, + lambda x, y: x if x > y else y, + _internal._maximum_scalar, + None) + # pylint: enable= no-member, protected-access + +def minimum(lhs, rhs): + """ Perform minimum operator + + Parameters + ---------- + lhs : Array or float value + left hand side operand + + rhs : Array of float value + right hand side operand + + Returns + ------- + out: Array + result array + """ + # pylint: disable= no-member, protected-access + return _ufunc_helper( + lhs, + rhs, + _internal._minimum, + lambda x, y: x if x < y else y, + _internal._minimum_scalar, + None) # pylint: enable= no-member, protected-access def true_divide(lhs, rhs): @@ -731,125 +835,6 @@ def ones(shape, ctx=None, dtype=mx_real_t): arr[:] = 1.0 return arr -# pylint: disable=too-many-locals, invalid-name, no-member, protected-access, undefined-variable -# pylint: disable=too-many-branches -def _reduce(arr, axis=None, keepdims=False, typ='sum'): - """ Reduce the array along given axises. The semantic strictly follows numpy's document. - - Parameters - ---------- - arr : Array - the array to be reduced - axis : int or list(int), optional - along which axis to do reduction - keepdims : bool - whether the reduced axis should be kept in the final shape - - Returns - ------- - out: Array - The reduced NDArray. - """ - if 'sum' == typ: - reduce_func = sum_axis - elif 'max' == typ: - reduce_func = max_axis - elif 'min' == typ: - reduce_func = min_axis - else: - raise TypeError('typ=\'%s\' is not supported.' % typ) - ndim = len(arr.shape) - if axis is None: - axis = list(range(ndim)) - elif isinstance(axis, int): - axis = [axis] - elif isinstance(axis, tuple) or isinstance(axis, list): - axis = list(axis) - else: - raise TypeError('\'%s\' object is not supported as axis.' % type(axis).__name__) - - if list(range(ndim)) == axis: - ret = reduce_func(arr, axis=-1, keepdims=keepdims) - if not keepdims: - return ret.asnumpy()[0] - else: - return ret - for i in axis: - if not isinstance(i, int): - raise TypeError('\'%s\' object cannot be interpreted as an integer' % type(i).__name__) - axis = sorted([x if 0 <= x else x + ndim for x in axis]) - for i in axis: - if i < 0 or ndim <= i: - raise ValueError('\'axis\' entry is out of bounds') - if len(set(axis)) != len(axis): - raise ValueError('duplicate value in \'axis\'') - assert(len(axis) != 0) - ret = arr - for i in reversed(axis): - ret = reduce_func(ret, axis=i, keepdims=keepdims) - return ret -# pylint: enable=too-many-locals, invalid-name, no-member, protected-access, undefined-variable -# pylint: enable=too-many-branches - -def sum(arr, axis=None, keepdims=False): - """ Sum the array along given axises. The semantic strictly follows numpy's document. - - Parameters - ---------- - arr : Array - the array to be reduced - axis : int or list(int), optional - along which axis to do reduction - keepdims : bool - whether the reduced axis should be kept in the final shape - - Returns - ------- - out: Array - The reduced NDArray. - """ - return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='sum') - -def max(arr, axis=None, keepdims=False): - """ Take the maximum of the array along given axises. 
- The semantic strictly follows numpy's document. - - Parameters - ---------- - arr : Array - the array to be reduced - axis : int or list(int), optional - along which axis to do reduction - keepdims : bool - whether the reduced axis should be kept in the final shape - - Returns - ------- - out: Array - The reduced NDArray. - """ - return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='max') - -def min(arr, axis=None, keepdims=False): - """ Take the minimum of the array along given axises. - The semantic strictly follows numpy's document. - - Parameters - ---------- - arr : Array - the array to be reduced - axis : int or list(int), optional - along which axis to do reduction - keepdims : bool - whether the reduced axis should be kept in the final shape - - Returns - ------- - out: Array - The reduced NDArray. - """ - return _reduce(arr=arr, axis=axis, keepdims=keepdims, typ='min') - def full(shape, val, ctx=None): """Create a new NDArray filled with given value, with specified shape. @@ -1042,24 +1027,24 @@ def imdecode(str_img, clip_rect=(0, 0, 0, 0), out=None, index=0, channels=3, mea if mean is None: mean = NDArray(_new_empty_handle()) if out is None: - return NDArray._imdecode(mean, index, - clip_rect[0], - clip_rect[1], - clip_rect[2], - clip_rect[3], - channels, - len(str_img), - str_img=str_img) + return _internal._imdecode(mean, index, + clip_rect[0], + clip_rect[1], + clip_rect[2], + clip_rect[3], + channels, + len(str_img), + str_img=str_img) else: - return NDArray._imdecode(mean, index, - clip_rect[0], - clip_rect[1], - clip_rect[2], - clip_rect[3], - channels, - len(str_img), - str_img=str_img, - out=out) + return _internal._imdecode(mean, index, + clip_rect[0], + clip_rect[1], + clip_rect[2], + clip_rect[3], + channels, + len(str_img), + str_img=str_img, + out=out) # pylint: disable=too-many-locals, invalid-name def _make_ndarray_function(handle): @@ -1123,7 +1108,7 @@ def binary_ndarray_function(lhs, rhs, out=None, **kwargs): """Internal binary function """ if out: - if isinstance(out, NDArray) == False: + if not isinstance(out, NDArray): raise TypeError('out must be NDArray') if not out.writable: raise TypeError('out must be writable') @@ -1144,7 +1129,7 @@ def binary_ndarray_function(lhs, rhs, out=None, **kwargs): def unary_ndarray_function(src, out=None, *args, **kwargs): """internal NDArray function""" if out: - if isinstance(out, NDArray) == False: + if not isinstance(out, NDArray): raise TypeError('out must be NDArray') if not out.writable: raise TypeError('out must be writable') @@ -1225,12 +1210,13 @@ def _init_ndarray_module(): ctypes.byref(plist))) module_obj = sys.modules[__name__] + module_internal = sys.modules["mxnet._ndarray_internal"] for i in range(size.value): hdl = FunctionHandle(plist[i]) function = _make_ndarray_function(hdl) - # if function name starts with underscore, register as static method of NDArray + # if function name starts with underscore, register as internal namespace if function.__name__.startswith('_'): - setattr(NDArray, function.__name__, staticmethod(function)) + setattr(module_internal, function.__name__, function) else: fname = function.__name__ fn_obj = getattr(module_obj, fname, None) diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py index 9e186eed926f..e9c0e385022d 100644 --- a/python/mxnet/operator.py +++ b/python/mxnet/operator.py @@ -211,10 +211,10 @@ def list_arguments_entry(out, _): None, None, None, None, None) cb_ptr = format(cast(pointer(self.info_), c_void_p).value, 'x') # pylint: disable=E1101 - sym = 
symbol.Symbol._Native(*args, - info=cb_ptr, - need_top_grad=self.need_top_grad(), - **kwargs) + sym = symbol._internal._Native(*args, + info=cb_ptr, + need_top_grad=self.need_top_grad(), + **kwargs) # keep a reference of ourself in PythonOp so we don't get garbage collected. PythonOp._ref_holder.append(self) return sym @@ -358,9 +358,9 @@ def declare_backward_dependency(out_grad, in_data, out_data, num_dep, deps, _): None, None, None, None, None, None) cb_ptr = format(cast(pointer(self.info_), c_void_p).value, 'x') # pylint: disable=E1101 - sym = symbol.Symbol._NDArray(*args, - info=cb_ptr, - **kwargs) + sym = symbol._internal._NDArray(*args, + info=cb_ptr, + **kwargs) # keep a reference of ourself in PythonOp so we don't get garbage collected. PythonOp._ref_holder.append(self) return sym diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 18c5da666843..94a84232f81f 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -5,7 +5,7 @@ from .base import _LIB, check_call from .base import c_array, mx_uint, mx_float, c_str from .base import OptimizerHandle, OptimizerCreator -from .ndarray import NDArray, zeros, clip, sqrt +from .ndarray import NDArray, zeros, clip, sqrt, square from .random import normal @@ -90,7 +90,7 @@ def _init_cc_optimizer(name, param_keys, param_vals): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=0.01, - lr_scheduler=None, sym=None): + lr_scheduler=None, sym=None, begin_num_update=0): self.rescale_grad = rescale_grad self.lr = learning_rate self.lr_scheduler = lr_scheduler @@ -100,7 +100,8 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.wd = wd self.lr_mult = {} self.wd_mult = {} - self.num_update = 0 + self.begin_num_update = begin_num_update + self.num_update = begin_num_update self._index_update_count = {} self.clip_gradient = clip_gradient @@ -176,7 +177,7 @@ def _update_count(self, index): The index will be updated """ if index not in self._index_update_count: - self._index_update_count[index] = 0 + self._index_update_count[index] = self.begin_num_update self._index_update_count[index] += 1 self.num_update = max(self._index_update_count[index], self.num_update) @@ -586,14 +587,17 @@ def update(self, index, weight, grad, state): if self.clip_gradient is not None: clip(grad, -self.clip_gradient, self.clip_gradient, out=grad) - mean[:] = self.beta1 * mean + (1. - self.beta1) * grad - variance[:] = self.beta2 * variance + (1. - self.beta2) * grad * grad + mean *= self.beta1 + mean += grad * (1. - self.beta1) + + variance *= self.beta2 + variance += (1 - self.beta2) * square(grad, out=grad) coef1 = 1. - self.beta1**t coef2 = 1. - self.beta2**t lr *= math.sqrt(coef2)/coef1 - weight[:] -= lr*mean/(sqrt(variance) + self.epsilon) + weight -= lr*mean/(sqrt(variance) + self.epsilon) wd = self._get_wd(index) if wd > 0.: diff --git a/python/mxnet/random.py b/python/mxnet/random.py index 489a8bd16097..b54e40d653bb 100644 --- a/python/mxnet/random.py +++ b/python/mxnet/random.py @@ -5,7 +5,8 @@ import ctypes from .base import _LIB, check_call -from .ndarray import NDArray, empty +from .ndarray import empty +from . 
import _ndarray_internal as _internal def uniform(low, high, shape=None, ctx=None, out=None): @@ -38,17 +39,17 @@ def uniform(low, high, shape=None, ctx=None, out=None): if isinstance(shape, int): shape = (shape,) out = empty(shape, ctx) - return NDArray._random_uniform(low, high, out=out) + return _internal._sample_uniform(low=low, high=high, shape=out.shape, out=out) -def normal(mean, stdvar, shape=None, ctx=None, out=None): +def normal(loc, scale, shape=None, ctx=None, out=None): """Generate normal(Gaussian) distribution N(mean, stdvar^2) with shape. Parameters ---------- - mean : float + loc : float The mean of the normal distribution. - stdvar : float + scale : float The standard deviation of normal distribution. shape : tuple, optional Output shape of the NDArray generated. @@ -71,7 +72,7 @@ def normal(mean, stdvar, shape=None, ctx=None, out=None): if isinstance(shape, int): shape = (shape,) out = empty(shape, ctx) - return NDArray._random_gaussian(mean, stdvar, out=out) + return _internal._sample_normal(loc=loc, scale=scale, shape=out.shape, out=out) def seed(seed_state): @@ -96,4 +97,3 @@ def seed(seed_state): raise ValueError('sd must be int') seed_state = ctypes.c_int(int(seed_state)) check_call(_LIB.MXRandomSeed(seed_state)) - diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 2577b8f65c30..dfe1890bddea 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -19,7 +19,7 @@ from .ndarray import NDArray, zeros, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP from .executor import Executor from .symbol_doc import SymbolDoc - +from . import _symbol_internal as _internal class Symbol(object): """Symbol is symbolic graph of the mxnet.""" @@ -37,9 +37,9 @@ def __init__(self, handle): def __add__(self, other): if isinstance(other, Symbol): - return Symbol._Plus(self, other) + return _internal._Plus(self, other) if isinstance(other, Number): - return Symbol._PlusScalar(self, scalar=other) + return _internal._PlusScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) @@ -48,23 +48,23 @@ def __radd__(self, other): def __sub__(self, other): if isinstance(other, Symbol): - return Symbol._Minus(self, other) + return _internal._Minus(self, other) if isinstance(other, Number): - return Symbol._MinusScalar(self, scalar=other) + return _internal._MinusScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) def __rsub__(self, other): if isinstance(other, Number): - return Symbol._RMinusScalar(self, scalar=other) + return _internal._RMinusScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) def __mul__(self, other): if isinstance(other, Symbol): - return Symbol._Mul(self, other) + return _internal._Mul(self, other) if isinstance(other, Number): - return Symbol._MulScalar(self, scalar=other) + return _internal._MulScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) @@ -73,15 +73,15 @@ def __rmul__(self, other): def __div__(self, other): if isinstance(other, Symbol): - return Symbol._Div(self, other) + return _internal._Div(self, other) if isinstance(other, Number): - return Symbol._DivScalar(self, scalar=other) + return _internal._DivScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) def __rdiv__(self, other): if isinstance(other, Number): - return Symbol._RDivScalar(self, scalar=other) + return _internal._RDivScalar(self, scalar=other) else: raise TypeError('type %s not supported' % 
str(type(other))) @@ -93,9 +93,9 @@ def __rtruediv__(self, other): def __pow__(self, other): if isinstance(other, Symbol): - return Symbol._Power(self, other) + return _internal._Power(self, other) if isinstance(other, Number): - return Symbol._PowerScalar(self, scalar=other) + return _internal._PowerScalar(self, scalar=other) else: raise TypeError('type %s not supported' % str(type(other))) @@ -1091,11 +1091,12 @@ def _init_symbol_module(): check_call(_LIB.MXSymbolListAtomicSymbolCreators(ctypes.byref(size), ctypes.byref(plist))) module_obj = sys.modules[__name__] + module_internal = sys.modules["mxnet._symbol_internal"] for i in range(size.value): hdl = SymbolHandle(plist[i]) function = _make_atomic_symbol_function(hdl) if function.__name__.startswith('_'): - setattr(Symbol, function.__name__, staticmethod(function)) + setattr(module_internal, function.__name__, function) else: setattr(module_obj, function.__name__, function) @@ -1118,91 +1119,17 @@ def pow(base, exp): result: Symbol or Number """ if isinstance(base, Symbol) and isinstance(exp, Symbol): - return Symbol._Power(base, exp) + return _internal._Power(base, exp) if isinstance(base, Symbol) and isinstance(exp, Number): - return Symbol._PowerScalar(base, scalar=exp) + return _internal._PowerScalar(base, scalar=exp) if isinstance(base, Number) and isinstance(exp, Symbol): - return Symbol._RPowerScalar(exp, scalar=base) + return _internal._RPowerScalar(exp, scalar=base) if isinstance(base, Number) and isinstance(exp, Number): return base**exp else: raise TypeError('types (%s, %s) not supported' % (str(type(base)), str(type(exp)))) -# pylint: disable= undefined-variable, too-many-branches -def _reduce(data, axis=None, keepdims=False, name=None, typ='sum'): - """ Reduce the array along given axis. The semantic strictly follows numpy's document. - - Parameters - ---------- - data : Symbol - the array to be reduced - axis : int or list(int), optional - along which axis to do reduction - keepdims : bool - whether the reduced axis should be kept in the final shape - - Returns - ------- - out: Symbol - Symbol represents the reduced Array. - """ - if 'sum' == typ: - reduce_func = sum_axis - else: - raise TypeError('typ=\'%s\' is not supported.' % typ) - if axis is None: - ret = reduce_func(data, axis=-1, keepdims=keepdims, name=name) - return ret - elif isinstance(axis, int): - axis = [axis] - elif isinstance(axis, tuple) or isinstance(axis, list): - axis = list(axis) - else: - raise TypeError('\'%s\' object is not supported as axis.' % type(axis).__name__) - - for i in axis: - if not isinstance(i, int): - raise TypeError('\'%s\' object cannot be interpreted as an integer' % type(i).__name__) - axis = sorted(axis) - for i in axis: - if i < 0: - raise ValueError('\'axis\' entry is out of bounds') - if len(set(axis)) != len(axis): - raise ValueError('duplicate value in \'axis\'') - assert (len(axis) != 0) - ret = data - for (i, ele) in enumerate(reversed(axis)): - if i == (len(axis) - 1): - ret = reduce_func(ret, axis=ele, keepdims=keepdims, name=name) - else: - ret = reduce_func(ret, axis=ele, keepdims=keepdims) - return ret -# pylint: enable= undefined-variable, too-many-branches - - -def sum(data, axis=None, keepdims=False, name=None): - """ Calculate the sum of the array along given axis. - The semantic strictly follows numpy's document. 
-
-    Parameters
-    ----------
-    data : Symbol
-        the array to be reduced
-    axis : int or list(int), optional
-        along which axis to do reduction
-    keepdims : bool
-        whether the reduced axis should be kept in the final shape
-
-    Returns
-    -------
-    out: Symbol
-        Symbol represents the reduced Array.
-    """
-    return _reduce(data=data, axis=axis, keepdims=keepdims, name=name, typ='sum')
-
-
-
 # pylint: disable=no-member
 # pylint: disable=redefined-builtin
 def maximum(left, right):
@@ -1218,11 +1145,11 @@
        result: Symbol or Number
    """
    if isinstance(left, Symbol) and isinstance(right, Symbol):
-        return Symbol._Maximum(left, right)
+        return _internal._Maximum(left, right)
    if isinstance(left, Symbol) and isinstance(right, Number):
-        return Symbol._MaximumScalar(left, scalar=right)
+        return _internal._MaximumScalar(left, scalar=right)
    if isinstance(left, Number) and isinstance(right, Symbol):
-        return Symbol._MaximumScalar(right, scalar=left)
+        return _internal._MaximumScalar(right, scalar=left)
    if isinstance(left, Number) and isinstance(right, Number):
        return left if left > right else right
    else:
@@ -1244,11 +1171,11 @@
        result: Symbol or Number
    """
    if isinstance(left, Symbol) and isinstance(right, Symbol):
-        return Symbol._Minimum(left, right)
+        return _internal._Minimum(left, right)
    if isinstance(left, Symbol) and isinstance(right, Number):
-        return Symbol._MinimumScalar(left, scalar=right)
+        return _internal._MinimumScalar(left, scalar=right)
    if isinstance(left, Number) and isinstance(right, Symbol):
-        return Symbol._MinimumScalar(right, scalar=left)
+        return _internal._MinimumScalar(right, scalar=left)
    if isinstance(left, Number) and isinstance(right, Number):
-        return left if left > right else right
+        return left if left < right else right
    else:
diff --git a/scala-package/core/pom.xml b/scala-package/core/pom.xml
index 1cd795978f68..5900a0a710a2 100644
--- a/scala-package/core/pom.xml
+++ b/scala-package/core/pom.xml
@@ -61,7 +61,10 @@
        org.scalatest
        scalatest-maven-plugin
-          -Djava.library.path=${project.parent.basedir}/native/${platform}/target
+
+            -Djava.library.path=${project.parent.basedir}/native/${platform}/target \
+            -Dlog4j.configuration=file://${project.basedir}/src/test/resources/log4j.properties
+
diff --git a/scala-package/core/scripts/get_cifar_data.sh b/scala-package/core/scripts/get_cifar_data.sh
index 48c4bfde2225..4b59e2c2ad4b 100755
--- a/scala-package/core/scripts/get_cifar_data.sh
+++ b/scala-package/core/scripts/get_cifar_data.sh
@@ -5,7 +5,7 @@ fi

 cifar_data_path="./data/cifar10.zip"
 if [ ! -f "$cifar_data_path" ]; then
-    wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P $data_path
+    wget http://data.dmlc.ml/mxnet/data/cifar10.zip -P $data_path
    cd $data_path
    unzip -u cifar10.zip
-fi
\ No newline at end of file
+fi
diff --git a/scala-package/core/scripts/get_mnist_data.sh b/scala-package/core/scripts/get_mnist_data.sh
index e080144f6663..359e399629cc 100755
--- a/scala-package/core/scripts/get_mnist_data.sh
+++ b/scala-package/core/scripts/get_mnist_data.sh
@@ -5,7 +5,7 @@ fi

 mnist_data_path="./data/mnist.zip"
 if [ !
-f "$mnist_data_path" ]; then - wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip -P $data_path + wget http://data.dmlc.ml/mxnet/data/mnist.zip -P $data_path cd $data_path unzip -u mnist.zip fi diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala index d995176f1796..cf3bee93a98a 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Base.scala @@ -43,7 +43,8 @@ object Base { "Copying native library from the archive. " + "Consider installing the library somewhere in the path " + "(for Windows: PATH, for Linux: LD_LIBRARY_PATH), " + - "or specifying by Java cmd option -Djava.library.path=[lib path].") + "or specifying by Java cmd option -Djava.library.path=[lib path]." + + "Exception:", e) NativeLibraryLoader.loadLibrary("mxnet-scala") } } catch { @@ -69,6 +70,7 @@ object Base { System.loadLibrary(libname) } catch { case e: UnsatisfiedLinkError => + logger.warn("Failed to load from native path. Exception:", e) val os = System.getProperty("os.name") // ref: http://lopica.sourceforge.net/os.html if (os.startsWith("Linux")) { diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala index 4dcc52e57efa..35aa2eef6ada 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/EvalMetric.scala @@ -59,6 +59,7 @@ class Accuracy extends EvalMetric("accuracy") { } } this.numInst += predLabel.shape(0) + predLabel.dispose() } } } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala index 523093c975f4..281f208843bb 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Executor.scala @@ -74,7 +74,9 @@ object Executor { targets: Seq[Array[(Int, Int, NDArray)]]): Unit = { for ((src, dTargets) <- data zip targets) { for ((start, end, dst) <- dTargets) { - src.slice(start, end).copyTo(dst) + val sliced = src.slice(start, end) + sliced.copyTo(dst) + sliced.dispose() } } } @@ -404,7 +406,7 @@ class DataParallelExecutorManager(symbol: Symbol, private val trainExecs = ctx.zipWithIndex.map { case (context, i) => val dataShapes = - trainData.provideData.map { case (name: String, shape: Shape) => + (trainData.provideData ++ trainData.provideLabel).map { case (name: String, shape: Shape) => (name, Shape(slices(i)._2 - slices(i)._1) ++ shape.drop(1)) } symbol.simpleBind(context, "write", shapeDict = dataShapes) diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala index e11d85fe7cb7..98ce1953243d 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/LibInfo.scala @@ -39,6 +39,13 @@ class LibInfo { useVars: Array[NDArrayHandle], scalarArgs: Array[MXFloat], mutateVars: Array[NDArrayHandle]): Int + @native def mxFuncInvokeEx(function: FunctionHandle, + useVars: Array[NDArrayHandle], + scalarArgs: Array[MXFloat], + mutateVars: Array[NDArrayHandle], + numParams: Int, + paramKeys: Array[Array[Byte]], + paramVals: Array[Array[Byte]]): Int @native def mxNDArrayGetShape(handle: NDArrayHandle, ndim: MXUintRef, data: ArrayBuffer[Int]): Int diff --git 
a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala index 7f1948b45d31..6807d6a72f31 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/NDArray.scala @@ -55,12 +55,12 @@ object NDArray { if (output == null) { require(acceptEmptyMutate, s"argument out is required to call $funcName") output = new NDArray(newEmptyHandle()) + addDependency(Array(lhs, rhs), Array(output)) } checkCall(_LIB.mxFuncInvoke(handle, Array(lhs.handle, rhs.handle), Array[MXFloat](), Array(output.handle))) - addDependency(Array(lhs, rhs), Array(output)) case _ => throw new IllegalArgumentException(s"call $funcName as binary function") } output @@ -76,12 +76,12 @@ object NDArray { if (output == null) { require(acceptEmptyMutate, s"argument out is required to call $funcName") output = new NDArray(newEmptyHandle()) + addDependency(Array(src), Array(output)) } checkCall(_LIB.mxFuncInvoke(handle, Array(src.handle), Array[MXFloat](), Array(output.handle))) - addDependency(Array(src), Array(output)) case _ => throw new IllegalArgumentException(s"call $funcName as unary function") } output @@ -91,14 +91,28 @@ object NDArray { * Invoke this function by passing in parameters * * @param args Positional arguments of input scalars and NDArray - * @param out NDArray or tuple of NDArray, optional + * @param kwargs: Key-value arguments for functions. e.g., + * out: NDArray or tuple of NDArray, optional * Output NDArray, used to hold the output result. * @return The result NDArray(tuple) of result of computation. */ def invokeGenericFunc(funcName: String, - args: Array[Any], - out: Array[NDArray] = null): Array[NDArray] = { - var mutateVars = out + args: Array[Any] = null, + kwargs: Map[String, Any] = null): Array[NDArray] = { + var mutateVars: Array[NDArray] = null + val realKwargs = + if (kwargs != null && kwargs.contains("out")) { + val out = kwargs("out") + mutateVars = + if (out.isInstanceOf[NDArray]) { + Array(kwargs("out").asInstanceOf[NDArray]) + } else { + kwargs("out").asInstanceOf[Array[NDArray]] + } + kwargs - "out" + } else { + kwargs + } val function = functions(funcName) require(function != null, s"invalid function name $funcName") function match { @@ -109,17 +123,28 @@ object NDArray { scalarRange: Range) => require(mutateVars == null || nMutateVars == mutateVars.length, s"expect $nMutateVars in $funcName") + val useVars = useVarsRange.map(args(_).asInstanceOf[NDArray]).toArray + val scalarVars = scalarRange.map(args(_).asInstanceOf[MXFloat]).toArray if (mutateVars == null) { require(acceptEmptyMutate, s"argument out is required to call $funcName") mutateVars = Array.fill[NDArray](nMutateVars)(new NDArray(newEmptyHandle())) + addDependency(useVars, mutateVars) } - val useVars = useVarsRange.map(args(_).asInstanceOf[NDArray]).toArray - val scalarVars = scalarRange.map(args(_).asInstanceOf[MXFloat]).toArray - checkCall(_LIB.mxFuncInvoke(handle, + val (numKwargs: Int, + kwargKeys: Option[Array[Array[Byte]]], + kwargVals: Option[Array[Array[Byte]]]) = + if (realKwargs == null) { + (0, None, None) + } else { + (realKwargs.size, + Some(realKwargs.keys.map(_.getBytes("ASCII") ++ Array(0.toByte)).toArray), + Some(realKwargs.values.map(_.toString.getBytes("ASCII") ++ Array(0.toByte)).toArray)) + } + checkCall(_LIB.mxFuncInvokeEx(handle, useVars.map(_.handle), scalarVars, - mutateVars.map(_.handle).array)) - addDependency(useVars, mutateVars) + mutateVars.map(_.handle).array, + 
numKwargs, kwargKeys.orNull, kwargVals.orNull)) case _ => throw new IllegalArgumentException(s"call $funcName as generic function") } mutateVars @@ -464,11 +489,15 @@ object NDArray { } def randomUniform(low: Float, high: Float, out: NDArray): NDArray = { - NDArray.invokeGenericFunc("_random_uniform", Array(low, high), Array(out))(0) + require(out != null) + NDArray.invokeGenericFunc("_sample_uniform", kwargs = Map[String, Any]( + "low" -> low, "high" -> high, "shape" -> out.shape, "out" -> out))(0) } - def randomGaussian(mean: Float, stdvar: Float, out: NDArray): NDArray = { - NDArray.invokeGenericFunc("_random_gaussian", Array(mean, stdvar), Array(out))(0) + def randomGaussian(loc: Float, scale: Float, out: NDArray): NDArray = { + require(out != null) + NDArray.invokeGenericFunc("_sample_normal", kwargs = Map[String, Any]( + "loc" -> loc, "scale" -> scale, "shape" -> out.shape, "out" -> out))(0) } /** @@ -743,7 +772,7 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, */ def set(value: Float): NDArray = { require(writable, "trying to assign to a readonly NDArray") - NDArray.invokeGenericFunc("_set_value", Array[Any](value), out = Array(this)) + NDArray.invokeGenericFunc("_set_value", Array[Any](value), Map[String, Any]("out" -> this)) this } @@ -777,7 +806,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, if (!writable) { throw new IllegalArgumentException("trying to add to a readonly NDArray") } - NDArray.invokeGenericFunc("_plus_scalar", Array[Any](this, other), out = Array(this)) + NDArray.invokeGenericFunc("_plus_scalar", Array[Any](this, other), + Map[String, Any]("out" -> this)) this } @@ -800,7 +830,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, if (!writable) { throw new IllegalArgumentException("trying to subtract from a readonly NDArray") } - NDArray.invokeGenericFunc("_minus_scalar", Array[Any](this, other), out = Array(this)) + NDArray.invokeGenericFunc("_minus_scalar", Array[Any](this, other), + Map[String, Any]("out" -> this)) this } @@ -827,7 +858,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, if (!writable) { throw new IllegalArgumentException("trying to multiply to a readonly NDArray") } - NDArray.invokeGenericFunc("_mul_scalar", Array[Any](this, other), out = Array(this)) + NDArray.invokeGenericFunc("_mul_scalar", Array[Any](this, other), + Map[String, Any]("out" -> this)) this } @@ -850,7 +882,8 @@ class NDArray private[mxnet](private[mxnet] val handle: NDArrayHandle, if (!writable) { throw new IllegalArgumentException("trying to divide from a readonly NDArray") } - NDArray.invokeGenericFunc("_div_scalar", Array[Any](this, other), out = Array(this)) + NDArray.invokeGenericFunc("_div_scalar", Array[Any](this, other), + Map[String, Any]("out" -> this)) this } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala index 7b456b0b4bb2..7c233b1c8988 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Optimizer.scala @@ -7,11 +7,19 @@ object Optimizer { new MXKVStoreUpdater { val states = new scala.collection.mutable.HashMap[Int, AnyRef] override def update(index: Int, grad: NDArray, weight: NDArray): Unit = { - val state = states.getOrElseUpdate(index, optimizer.createState(index, weight)) + val state = + if (states.contains(index)) { + states.get(index).get + } else { + val newState = 
optimizer.createState(index, weight) + states.put(index, newState) + newState + } optimizer.update(index, weight, grad, state) } override def dispose(): Unit = { states.values.foreach(optimizer.disposeState) + states.clear() } } } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala index e1279e095dfa..c66dd32cc6a8 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/Random.scala @@ -37,15 +37,15 @@ object Random { /** * Generate normal(Gaussian) distribution N(mean, stdvar^^2) with shape. * - * @param mean The mean of the normal distribution. - * @param stdvar The standard deviation of normal distribution. + * @param loc The mean of the normal distribution. + * @param scale The standard deviation of normal distribution. * @param shape Output shape of the NDArray generated. * @param ctx Context of output NDArray, will use default context if not specified. * @param out Output place holder * @return The result NDArray with generated result. */ - def normal(mean: Float, - stdvar: Float, + def normal(loc: Float, + scale: Float, shape: Shape = null, ctx: Context = null, out: NDArray = null): NDArray = { @@ -56,7 +56,7 @@ object Random { require(shape != null, "shape is required when out is not specified") outCopy = empty(shape, ctx) } - randomGaussian(mean, stdvar, outCopy) + randomGaussian(loc, scale, outCopy) } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala index 41e9ef1cf9b4..929630065926 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/MXDataIter.scala @@ -29,9 +29,11 @@ class MXDataIter private[mxnet](private[mxnet] val handle: DataIterHandle, iterNext() val data = currentBatch.data(0) val label = currentBatch.label(0) - reset() // properties - (Map(dataName -> data.shape), Map(labelName -> label.shape), data.shape(0)) + val res = (Map(dataName -> data.shape), Map(labelName -> label.shape), data.shape(0)) + currentBatch.dispose() + reset() + res } else { (null, null, 0) } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala index 5ae522658581..8aa821daf628 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/io/PrefetchingIter.scala @@ -2,62 +2,161 @@ package ml.dmlc.mxnet.io import ml.dmlc.mxnet.{DataBatch, DataIter, NDArray, Shape} import org.slf4j.LoggerFactory +import java.util.concurrent.Semaphore /** - * TODO * Base class for prefetching iterators. Takes one or more DataIters * and combine them with prefetching. 
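 * Design note (editor's sketch of the mechanism implemented below): each wrapped
 * iterator gets its own producer thread, synchronized with the consumer through a
 * dataReady/dataTaken semaphore pair, so next() only hands out batches that have
 * already been fetched.
 *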
* + * @author Depeng Liang * * @param iters list of DataIters * @param dataNames * @param labelNames */ class PrefetchingIter(val iters: IndexedSeq[DataIter], - val dataNames: Map[String, String] = null, - val labelNames: Map[String, String] = null) extends DataIter { + val dataNames: IndexedSeq[Map[String, String]] = null, + val labelNames: IndexedSeq[Map[String, String]] = null) extends DataIter { private val logger = LoggerFactory.getLogger(classOf[PrefetchingIter]) + require(iters.length > 0, "Iters length must be greater than 0") + + private val _provideData: Map[String, Shape] = { + if (dataNames == null) { + iters.map(_.provideData).foldLeft(Map[String, Shape]()) { (acc, elem) => + acc ++ elem + } + } else { + iters.zipWithIndex.map(tu => (tu._1.provideData, tu._2)) + .map(m => m._1.map(t => (dataNames(m._2)(t._1), t._2))) + .foldLeft(Map[String, Shape]()) { (acc, elem) => + acc ++ elem + } + } + } + + private val _provideLabel: Map[String, Shape] = { + if (labelNames == null) { + iters.map(_.provideLabel).foldLeft(Map[String, Shape]()) { (acc, elem) => + acc ++ elem + } + } else { + iters.zipWithIndex.map(tu => (tu._1.provideLabel, tu._2)) + .map(m => m._1.map(t => (labelNames(m._2)(t._1), t._2))) + .foldLeft(Map[String, Shape]()) { (acc, elem) => + acc ++ elem + } + } + } + + private val _batchSize: Int = this._provideData.toList(0)._2(0) + private val dataReady: IndexedSeq[Semaphore] = + (0 until iters.length).map(i => new Semaphore(0)) + private val dataTaken: IndexedSeq[Semaphore] = + (0 until iters.length).map(i => new Semaphore(1)) + + @volatile private var started: Boolean = true private var currentBatch: DataBatch = null - private var nextBatch: DataBatch = null + private var nextBatch: Array[DataBatch] = (0 until iters.length).map { i => + new DataBatch(null, null, null, 0) + }.toArray + + // thread entry + def prefetchFunc(i: Int): Runnable = new Runnable { + override def run(): Unit = { + while (started) { + dataTaken(i).acquire() + if (started) { + try { + nextBatch(i) = iters(i).next() + } catch { + case ex: NoSuchElementException => nextBatch(i) = null + } + } + dataReady(i).release() + } + } + } + + private val prefetchThreads = + for (i <- 0 until iters.length) yield new Thread(prefetchFunc(i)) + prefetchThreads.foreach(_.start()) + + override def next(): DataBatch = currentBatch /** * reset the iterator */ - override def reset(): Unit = ??? + override def reset(): Unit = { + for (e <- dataReady) e.acquire() + for (i <- iters) i.reset() + for (e <- dataTaken) e.release() + } + + override def batchSize: Int = this._batchSize /** * get data of current batch * @return the data of current batch */ - override def getData(): IndexedSeq[NDArray] = ??? + override def getData(): IndexedSeq[NDArray] = currentBatch.data /** * Get label of current batch * @return the label of current batch */ - override def getLabel(): IndexedSeq[NDArray] = ??? + override def getLabel(): IndexedSeq[NDArray] = currentBatch.label /** * the index of current batch * @return */ - override def getIndex(): IndexedSeq[Long] = ??? + override def getIndex(): IndexedSeq[Long] = currentBatch.index // The name and shape of label provided by this iterator - override def provideLabel: Map[String, Shape] = ??? + override def provideLabel: Map[String, Shape] = this._provideLabel /** * get the number of padding examples * in current batch * @return number of padding examples in current batch */ - override def getPad(): Int = ??? 
+ override def getPad(): Int = this.currentBatch.pad // The name and shape of data provided by this iterator - override def provideData: Map[String, Shape] = ??? + override def provideData: Map[String, Shape] = this._provideData - override def hasNext: Boolean = ??? + override def hasNext: Boolean = { + for (e <- dataReady) e.acquire() + if (nextBatch(0) == null) { + for (i <- nextBatch) { + assert(i == null, "Number of entry mismatches between iterators") + } + for (e <- dataReady) e.release() + false + } else { + for (batch <- nextBatch) { + assert(batch.pad == nextBatch(0).pad, + "Number of entry mismatches between iterators") + } + val datas = for (batch <- nextBatch) yield batch.data + val labels = for (batch <- nextBatch) yield batch.label + currentBatch = new DataBatch(datas.toIndexedSeq.flatten, + labels.toIndexedSeq.flatten, + nextBatch(0).index, + nextBatch(0).pad) + for (e <- dataTaken) e.release() + true + } + } - override def batchSize: Int = ??? + /** + * Stop all its internal prefetching threads. + * The object shall never be used after it is disposed. + */ + def dispose(): Unit = { + started = false + for (e <- dataTaken) e.release() + for (t <- prefetchThreads) t.join() + } } diff --git a/scala-package/core/src/test/resources/log4j.properties b/scala-package/core/src/test/resources/log4j.properties new file mode 100644 index 000000000000..7d7ca36b28a1 --- /dev/null +++ b/scala-package/core/src/test/resources/log4j.properties @@ -0,0 +1,7 @@ +# for development debugging +log4j.rootLogger = debug, stdout + +log4j.appender.stdout = org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target = System.out +log4j.appender.stdout.layout = org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala index 38d8adf930e1..5fefc0704ba4 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/IOSuite.scala @@ -1,6 +1,6 @@ package ml.dmlc.mxnet -import ml.dmlc.mxnet.io.{NDArrayIter, ResizeIter} +import ml.dmlc.mxnet.io.{NDArrayIter, ResizeIter, PrefetchingIter} import org.scalatest.{BeforeAndAfterAll, FunSuite} import scala.sys.process._ @@ -150,6 +150,69 @@ class IOSuite extends FunSuite with BeforeAndAfterAll { assert(batchCount === nBatch) } + test("test PrefetchIter") { + // get data + "./scripts/get_mnist_data.sh" ! 
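+    // MNIST's training set has 60000 examples, so with batch_size = 100 below
+    // each underlying iterator yields 600 batches per pass (hence nBatch = 600)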
+ + val params = Map( + "image" -> "data/train-images-idx3-ubyte", + "label" -> "data/train-labels-idx1-ubyte", + "data_shape" -> "(784,)", + "batch_size" -> "100", + "shuffle" -> "1", + "flat" -> "1", + "silent" -> "0", + "seed" -> "10" + ) + + val mnistPack1 = IO.MNISTPack(params) + val mnistPack2 = IO.MNISTPack(params) + + val nBatch = 600 + var batchCount = 0 + + val mnistIter1 = mnistPack1.iterator + val mnistIter2 = mnistPack2.iterator + + var prefetchIter = new PrefetchingIter( + IndexedSeq(mnistIter1, mnistIter2), + IndexedSeq(Map("data" -> "data1"), Map("data" -> "data2")), + IndexedSeq(Map("label" -> "label1"), Map("label" -> "label2")) + ) + + // test loop + while(prefetchIter.hasNext) { + prefetchIter.next() + batchCount += 1 + } + assert(nBatch === batchCount) + + // test provideData + val provideData = prefetchIter.provideData + val provideLabel = prefetchIter.provideLabel + assert(provideData("data1") === Shape(100, 784)) + assert(provideData("data2") === Shape(100, 784)) + assert(provideLabel("label1") === Shape(100)) + assert(provideLabel("label2") === Shape(100)) + + // test reset + prefetchIter.reset() + prefetchIter.next() + val label0 = prefetchIter.getLabel().head.toArray + val data0 = prefetchIter.getData().head.toArray + prefetchIter.next() + prefetchIter.next() + prefetchIter.next() + prefetchIter.reset() + prefetchIter.next() + val label1 = prefetchIter.getLabel().head.toArray + val data1 = prefetchIter.getData().head.toArray + assert(label0 === label1) + assert(data0 === data1) + + prefetchIter.dispose() + } + test("test NDArrayIter") { val shape0 = Shape(Array(1000, 2, 2)) val data = IndexedSeq(NDArray.ones(shape0), NDArray.zeros(shape0)) diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala index 0f356bfb5fa4..57c065f2e86b 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/train/ConvSuite.scala @@ -91,6 +91,6 @@ class ConvSuite extends FunSuite with BeforeAndAfterAll { } val acc = numCorrect.toFloat / numInst logger.info(s"Final accuracy = $acc") - assert(acc > 0.96) + assert(acc > 0.95) } } diff --git a/scala-package/examples/scripts/rnn/run_test_charrnn.sh b/scala-package/examples/scripts/rnn/run_test_charrnn.sh new file mode 100644 index 000000000000..3abe98917a8f --- /dev/null +++ b/scala-package/examples/scripts/rnn/run_test_charrnn.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd) +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +# you can get the training data file using the following command +# wget http://data.dmlc.ml/mxnet/data/lab_data.zip +# unzip -o lab_data.zip +# for example ./datas/obama.txt +DATA_PATH=$1 +# for example ./models/obama +MODEL_PREFIX=$2 +# feel free to change the starter sentence +STARTER_SENTENCE="The joke" + +java -Xmx4G -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.rnn.TestCharRnn \ + --data-path $DATA_PATH \ + --model-prefix $MODEL_PREFIX \ + --starter-sentence "$STARTER_SENTENCE" diff --git a/scala-package/examples/scripts/rnn/run_train_charrnn.sh b/scala-package/examples/scripts/rnn/run_train_charrnn.sh new file mode 100644 index 000000000000..04379d33401a --- /dev/null +++ b/scala-package/examples/scripts/rnn/run_train_charrnn.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +MXNET_ROOT=$(cd 
"$(dirname $0)/../../../.."; pwd) +CLASS_PATH=$MXNET_ROOT/scala-package/assembly/linux-x86_64-gpu/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/* + +# which gpu card to use, -1 means cpu +GPU=$1 +# you can get the training data file using the following command +# wget http://data.dmlc.ml/mxnet/data/lab_data.zip +# unzip -o lab_data.zip +# for example ./datas/obama.txt +DATA_PATH=$2 +# for example ./models +SAVE_MODEL_PATH=$3 + +java -Xmx4G -cp $CLASS_PATH \ + ml.dmlc.mxnet.examples.rnn.TrainCharRnn \ + --data-path $DATA_PATH \ + --save-model-path $SAVE_MODEL_PATH \ + --gpu $GPU \ diff --git a/scala-package/examples/src/main/resources/log4j.properties b/scala-package/examples/src/main/resources/log4j.properties index 7d7ca36b28a1..cb92f4c5250a 100644 --- a/scala-package/examples/src/main/resources/log4j.properties +++ b/scala-package/examples/src/main/resources/log4j.properties @@ -1,5 +1,5 @@ # for development debugging -log4j.rootLogger = debug, stdout +log4j.rootLogger = info, stdout log4j.appender.stdout = org.apache.log4j.ConsoleAppender log4j.appender.stdout.Target = System.out diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala index d2605a152b4a..97deaf3123b2 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/ModelTrain.scala @@ -15,8 +15,7 @@ object ModelTrain { lr: Float = 0.1f, lrFactor: Float = 1f, lrFactorEpoch: Float = 1f, clipGradient: Float = 0f, monitorSize: Int = -1): Unit = { // kvstore - // TODO: if local mode and no gpu is used, set kv = null - val kv = KVStore.create(kvStore) + var kv = KVStore.create(kvStore) // load model val modelPrefixWithRank = @@ -62,6 +61,12 @@ object ModelTrain { lrScheduler = lrScheduler, clipGradient = clipGradient, momentum = 0.9f, wd = 0.00001f) + // disable kvstore for single device + if (kv.`type`.contains("local") && (devs.length == 1 || devs(0).deviceType != "gpu")) { + kv.dispose() + kv = null + } + val model = new FeedForward(ctx = devs, symbol = network, numEpoch = numEpochs, @@ -80,7 +85,9 @@ object ModelTrain { kvStore = kv, batchEndCallback = new Speedometer(batchSize, 50), epochEndCallback = checkpoint) - kv.dispose() + if (kv != null) { + kv.dispose() + } } // scalastyle:on parameterNum } diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala index f9acac462f10..44792cf4fc00 100644 --- a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/imclassification/TrainMnist.scala @@ -102,9 +102,9 @@ object TrainMnist { envs.put("DMLC_NUM_WORKER", inst.numWorker.toString) require(inst.numServer > 0, "Num of servers must > 0") envs.put("DMLC_NUM_SERVER", inst.numServer.toString) + logger.info("Init PS environments") + KVStoreServer.init(envs.toMap) } - logger.info("Init PS environments") - KVStoreServer.init(envs.toMap) if (inst.role != "worker") { logger.info("Start KVStoreServer for scheduler & servers") diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala 
b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
new file mode 100644
index 000000000000..9a11f6fa2950
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/ButketIo.scala
@@ -0,0 +1,205 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet.{DataBatch, DataIter, NDArray, Shape}
+import org.slf4j.LoggerFactory
+import scala.io.Source
+import scala.util.Random
+
+/**
+ * @author Depeng Liang
+ */
+object ButketIo {
+
+  type Text2Id = (String, Map[String, Int]) => Array[Int]
+  type ReadContent = String => String
+
+  def defaultReadContent(path: String): String = {
+    val content = Source.fromFile(path).mkString
+      .replaceAll("\n", " ")
+      .replaceAll("\\. ", " ") // replaceAll takes a regex, so the dot must be escaped
+    content
+  }
+
+  def defaultText2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
+    val words = {
+      val tmp = sentence.split(" ").filter(_.length() > 0)
+      for (w <- tmp) yield theVocab(w)
+    }
+    words.toArray
+  }
+
+  def defaultGenBuckets(sentences: Array[String], batchSize: Int,
+                        theVocab: Map[String, Int]): List[Int] = {
+    val lenDict = scala.collection.mutable.Map[Int, Int]()
+    var maxLen = -1
+    for (sentence <- sentences) {
+      val wordsLen = defaultText2Id(sentence, theVocab).length
+      if (wordsLen > 0) {
+        if (wordsLen > maxLen) {
+          maxLen = wordsLen
+        }
+        if (lenDict.contains(wordsLen)) {
+          lenDict(wordsLen) = lenDict(wordsLen) + 1
+        } else {
+          lenDict += wordsLen -> 1
+        }
+      }
+    }
+
+    var tl = 0
+    var buckets = List[Int]()
+    lenDict.foreach {
+      case (l, n) =>
+        if (n + tl >= batchSize) {
+          buckets = buckets :+ l
+          tl = 0
+        } else tl += n
+    }
+    if (tl > 0) buckets = buckets :+ maxLen
+    buckets
+  }
+
+  class BucketSentenceIter(
+      path: String, vocab: Map[String, Int], var buckets: List[Int],
+      _batchSize: Int, initStates: IndexedSeq[(String, (Int, Int))],
+      seperateChar: String = " ", text2Id: Text2Id = defaultText2Id,
+      readContent: ReadContent = defaultReadContent) extends DataIter {
+
+    private val logger = LoggerFactory.getLogger(classOf[BucketSentenceIter])
+
+    private val content = readContent(path)
+    private val sentences = content.split(seperateChar)
+
+    if (buckets.length == 0) {
+      buckets = defaultGenBuckets(sentences, batchSize, vocab)
+    }
+    buckets = buckets.sorted
+    // pre-allocate with the largest bucket for better memory sharing
+    private val defaultBucketKey = (buckets(0) /: buckets.drop(1)) { (max, elem) =>
+      if (max < elem) elem else max
+    }
+    // sentences longer than the maximum bucket size are simply ignored here
+    private val data = buckets.indices.map(x => Array[Array[Float]]()).toArray
+    for (sentence <- sentences) {
+      val ids = text2Id(sentence, vocab)
+      if (ids.length > 0) {
+        buckets.indices.foreach { idx =>
+          if (buckets(idx) >= ids.length) {
+            data(idx) = data(idx) :+
+              (ids.map(_.toFloat) ++ Array.fill[Float](buckets(idx) - ids.length)(0f))
+          }
+        }
+      }
+    }
+
+    // Get the size of each bucket, so that we could sample
+    // uniformly from the bucket
+    private val bucketSizes = data.map(_.length)
+    logger.info("Summary of dataset ==================")
+    buckets.zip(bucketSizes).foreach {
+      case (bkt, size) => logger.info(s"bucket of len $bkt : $size samples")
+    }
+
+    // make a random data iteration plan
+    // truncate each bucket into multiple of batch-size
+    private var bucketNBatches = Array[Int]()
+    for (i <- data.indices) {
+      bucketNBatches = bucketNBatches :+ (data(i).length / _batchSize)
+      data(i) = data(i).take(bucketNBatches(i) * _batchSize)
+    }
+
+    private val bucketPlan = {
+      val plan =
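+      // editorial note: the plan built below is a shuffled list of bucket
+      // indices with one entry per full batch, so batches from different
+      // buckets interleave while each batch stays within a single bucket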
bucketNBatches.zipWithIndex.map(x => Array.fill[Int](x._1)(x._2)).flatten + Random.shuffle(plan.toList) + } + + private val bucketIdxAll = data.map(_.length).toList + .map(l => Random.shuffle((0 until l).toList)) + private val bucketCurrIdx = data.map(x => 0) + + private var dataBuffer = Array[NDArray]() + private var labelBuffer = Array[NDArray]() + for (iBucket <- data.indices) { + dataBuffer = dataBuffer :+ NDArray.zeros(_batchSize, buckets(iBucket)) + labelBuffer = labelBuffer :+ NDArray.zeros(_batchSize, buckets(iBucket)) + } + + private val _provideData = { + val tmp = Map("data" -> Shape(_batchSize, defaultBucketKey)) + tmp ++ initStates.map(x => x._1 -> Shape(x._2._1, x._2._2)) + } + private val _provideLabel = Map("softmax_label" -> Shape(_batchSize, defaultBucketKey)) + + private var iBucket = 0 + + override def next(): DataBatch = { + val bucketIdx = bucketPlan(iBucket) + val dataBuf = dataBuffer(bucketIdx) + val iIdx = bucketCurrIdx(bucketIdx) + val idx = bucketIdxAll(bucketIdx).drop(iIdx).take(_batchSize) + bucketCurrIdx(bucketIdx) = bucketCurrIdx(bucketIdx) + _batchSize + + val datas = idx.map(i => data(bucketIdx)(i)).toArray + for (sentence <- datas) { + assert(sentence.length == buckets(bucketIdx)) + } + dataBuf.set(datas.flatten) + + val labelBuf = labelBuffer(bucketIdx) + val labels = idx.map(i => data(bucketIdx)(i).drop(1) :+ 0f).toArray + labelBuf.set(labels.flatten) + + iBucket += 1 + new DataBatch(IndexedSeq(dataBuf), + IndexedSeq(labelBuf), + getIndex(), + getPad()) + } + + /** + * reset the iterator + */ + override def reset(): Unit = { + iBucket = 0 + bucketCurrIdx.indices.map(i => bucketCurrIdx(i) = 0) + } + + override def batchSize: Int = _batchSize + + /** + * get data of current batch + * @return the data of current batch + */ + override def getData(): IndexedSeq[NDArray] = IndexedSeq(dataBuffer(bucketPlan(iBucket))) + + /** + * Get label of current batch + * @return the label of current batch + */ + override def getLabel(): IndexedSeq[NDArray] = IndexedSeq(labelBuffer(bucketPlan(iBucket))) + + /** + * the index of current batch + * @return + */ + override def getIndex(): IndexedSeq[Long] = IndexedSeq[Long]() + + // The name and shape of label provided by this iterator + override def provideLabel: Map[String, Shape] = this._provideLabel + + /** + * get the number of padding examples + * in current batch + * @return number of padding examples in current batch + */ + override def getPad(): Int = 0 + + // The name and shape of data provided by this iterator + override def provideData: Map[String, Shape] = this._provideData + + override def hasNext: Boolean = { + if (iBucket < bucketPlan.length) true else false + } + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala new file mode 100644 index 000000000000..0e2e5f7de66b --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Lstm.scala @@ -0,0 +1,148 @@ +package ml.dmlc.mxnet.examples.rnn + +import ml.dmlc.mxnet.Symbol +import ml.dmlc.mxnet.Executor + +/** + * @author Depeng Liang + */ +object Lstm { + + final case class LSTMState(c: Symbol, h: Symbol) + final case class LSTMParam(i2hWeight: Symbol, i2hBias: Symbol, + h2hWeight: Symbol, h2hBias: Symbol) + + // LSTM Cell symbol + def lstm(numHidden: Int, inData: Symbol, prevState: LSTMState, + param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = { + val inDataa = { + if (dropout > 0f) 
Symbol.Dropout()(Map("data" -> inData, "p" -> dropout))
+      else inData
+    }
+    val i2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_i2h")(Map("data" -> inDataa,
+      "weight" -> param.i2hWeight,
+      "bias" -> param.i2hBias,
+      "num_hidden" -> numHidden * 4))
+    val h2h = Symbol.FullyConnected(s"t${seqIdx}_l${layerIdx}_h2h")(Map("data" -> prevState.h,
+      "weight" -> param.h2hWeight,
+      "bias" -> param.h2hBias,
+      "num_hidden" -> numHidden * 4))
+    val gates = i2h + h2h
+    val sliceGates = Symbol.SliceChannel(s"t${seqIdx}_l${layerIdx}_slice")(Array(gates),
+      Map("num_outputs" -> 4))
+    val ingate = Symbol.Activation()(Map("data" -> sliceGates.get(0), "act_type" -> "sigmoid"))
+    val inTransform = Symbol.Activation()(Map("data" -> sliceGates.get(1), "act_type" -> "tanh"))
+    val forgetGate = Symbol.Activation()(Map("data" -> sliceGates.get(2), "act_type" -> "sigmoid"))
+    val outGate = Symbol.Activation()(Map("data" -> sliceGates.get(3), "act_type" -> "sigmoid"))
+    val nextC = (forgetGate * prevState.c) + (ingate * inTransform)
+    val nextH = outGate * Symbol.Activation()(Map("data" -> nextC, "act_type" -> "tanh"))
+    LSTMState(c = nextC, h = nextH)
+  }
+
+  // we define a new unrolling function here because the original
+  // one in lstm.py concatenates all the labels at the last layer together,
+  // making the mini-batch size of the label different from the data.
+  // I think the existing data-parallelization code needs some modification
+  // to allow this situation to work properly
+  def lstmUnroll(numLstmLayer: Int, seqLen: Int, inputSize: Int, numHidden: Int,
+                 numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
+    val embedWeight = Symbol.Variable("embed_weight")
+    val clsWeight = Symbol.Variable("cls_weight")
+    val clsBias = Symbol.Variable("cls_bias")
+
+    var paramCells = Array[LSTMParam]()
+    var lastStates = Array[LSTMState]()
+    for (i <- 0 until numLstmLayer) {
+      paramCells = paramCells :+ LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"),
+        i2hBias = Symbol.Variable(s"l${i}_i2h_bias"),
+        h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
+        h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
+      lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"),
+        h = Symbol.Variable(s"l${i}_init_h"))
+    }
+    assert(lastStates.length == numLstmLayer)
+
+    // embedding layer
+    val data = Symbol.Variable("data")
+    var label = Symbol.Variable("softmax_label")
+    val embed = Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize,
+      "weight" -> embedWeight, "output_dim" -> numEmbed))
+    val wordvec = Symbol.SliceChannel()(Array(embed),
+      Map("num_outputs" -> seqLen, "squeeze_axis" -> true))
+
+    var hiddenAll = Array[Symbol]()
+    var dpRatio = 0f
+    var hidden: Symbol = null
+    for (seqIdx <- 0 until seqLen) {
+      hidden = wordvec.get(seqIdx)
+      // stack LSTM
+      for (i <- 0 until numLstmLayer) {
+        if (i == 0) dpRatio = 0f else dpRatio = dropout
+        val nextState = lstm(numHidden, inData = hidden,
+          prevState = lastStates(i),
+          param = paramCells(i),
+          seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
+        hidden = nextState.h
+        lastStates(i) = nextState
+      }
+      // decoder
+      if (dropout > 0f) hidden = Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout))
+      hiddenAll = hiddenAll :+ hidden
+    }
+    val hiddenConcat = Symbol.Concat()(hiddenAll, Map("dim" -> 0))
+    val pred = Symbol.FullyConnected("pred")(Map("data" -> hiddenConcat, "num_hidden" -> numLabel,
+      "weight" -> clsWeight, "bias" -> clsBias))
+    label = Symbol.transpose(label)
+    label = Symbol.Reshape()(Map("data" -> label, "target_shape" ->
"(0,)")) + val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> pred, "label" -> label)) + sm + } + + def lstmInferenceSymbol(numLstmLayer: Int, inputSize: Int, numHidden: Int, + numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = { + val seqIdx = 0 + val embedWeight = Symbol.Variable("embed_weight") + val clsWeight = Symbol.Variable("cls_weight") + val clsBias = Symbol.Variable("cls_bias") + + var paramCells = Array[LSTMParam]() + var lastStates = Array[LSTMState]() + for (i <- 0 until numLstmLayer) { + paramCells = paramCells :+ LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"), + i2hBias = Symbol.Variable(s"l${i}_i2h_bias"), + h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"), + h2hBias = Symbol.Variable(s"l${i}_h2h_bias")) + lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c"), + h = Symbol.Variable(s"l${i}_init_h")) + } + assert(lastStates.length == numLstmLayer) + + val data = Symbol.Variable("data") + + var hidden = Symbol.Embedding("embed")(Map("data" -> data, "input_dim" -> inputSize, + "weight" -> embedWeight, "output_dim" -> numEmbed)) + + var dpRatio = 0f + // stack LSTM + for (i <- 0 until numLstmLayer) { + if (i == 0) dpRatio = 0f else dpRatio = dropout + val nextState = lstm(numHidden, inData = hidden, + prevState = lastStates(i), + param = paramCells(i), + seqIdx = seqIdx, layerIdx = i, dropout = dpRatio) + hidden = nextState.h + lastStates(i) = nextState + } + // decoder + if (dropout > 0f) hidden = Symbol.Dropout()(Map("data" -> hidden, "p" -> dropout)) + val fc = Symbol.FullyConnected("pred")(Map("data" -> hidden, "num_hidden" -> numLabel, + "weight" -> clsWeight, "bias" -> clsBias)) + val sm = Symbol.SoftmaxOutput("softmax")(Map("data" -> fc)) + var output = Array(sm) + for (state <- lastStates) { + output = output :+ state.c + output = output :+ state.h + } + Symbol.Group(output: _*) + } +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala new file mode 100644 index 000000000000..b91835f7d076 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/RnnModel.scala @@ -0,0 +1,55 @@ +package ml.dmlc.mxnet.examples.rnn + +import ml.dmlc.mxnet.Context +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.Shape +import ml.dmlc.mxnet.Symbol + +object RnnModel { + class LSTMInferenceModel(numLstmLayer: Int, inputSize: Int, numHidden: Int, + numEmbed: Int, numLabel: Int, argParams: Map[String, NDArray], + ctx: Context = Context.cpu(), dropout: Float = 0f) { + private val sym = Lstm.lstmInferenceSymbol(numLstmLayer, + inputSize, + numHidden, + numEmbed, + numLabel, + dropout) + private val batchSize = 1 + private val initC = (for (l <- 0 until numLstmLayer) + yield (s"l${l}_init_c" -> Shape(batchSize, numHidden))).toMap + private val initH = (for (l <- 0 until numLstmLayer) + yield (s"l${l}_init_h" -> Shape(batchSize, numHidden))).toMap + private val dataShape = Map("data" -> Shape(batchSize)) + private val inputShape = initC ++ initH ++ dataShape + private val executor = sym.simpleBind(ctx = ctx, shapeDict = inputShape) + + for (key <- this.executor.argDict.keys) { + if (!inputShape.contains(key) && argParams.contains(key) && key != "softmax_label") { + argParams(key).copyTo(this.executor.argDict(key)) + } + } + + private var stateName = (Array[String]() /: (0 until numLstmLayer)) { (acc, i) => + acc :+ s"l${i}_init_c" :+ s"l${i}_init_h" + } + + private val statesDict = 
stateName.zip(this.executor.outputs.drop(1)).toMap
+    private val inputArr = NDArray.zeros(dataShape("data"))
+
+    def forward(inputData: NDArray, newSeq: Boolean = false): Array[Float] = {
+      if (newSeq) {
+        for (key <- this.statesDict.keys) {
+          this.executor.argDict(key).set(0f)
+        }
+      }
+      inputData.copyTo(this.executor.argDict("data"))
+      this.executor.forward()
+      for (key <- this.statesDict.keys) {
+        this.statesDict(key).copyTo(this.executor.argDict(key))
+      }
+      val prob = this.executor.outputs(0).toArray
+      prob
+    }
+  }
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala
new file mode 100644
index 000000000000..a3351ff12557
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TestCharRnn.scala
@@ -0,0 +1,88 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet._
+import org.kohsuke.args4j.{CmdLineParser, Option}
+import org.slf4j.LoggerFactory
+import scala.collection.JavaConverters._
+
+/**
+ * Follows the demo to test the char rnn:
+ * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
+ * @author Depeng Liang
+ */
+object TestCharRnn {
+
+  private val logger = LoggerFactory.getLogger(classOf[TestCharRnn])
+
+  def main(args: Array[String]): Unit = {
+    val stcr = new TestCharRnn
+    val parser: CmdLineParser = new CmdLineParser(stcr)
+    try {
+      parser.parseArgument(args.toList.asJava)
+      assert(stcr.dataPath != null && stcr.modelPrefix != null && stcr.starterSentence != null)
+
+      // The batch size for training
+      val batchSize = 32
+      // We can support variable-length input.
+      // For this problem, we cut each input sentence to a length of 129,
+      // so we only need a single fixed-length bucket
+      val buckets = List(129)
+      // number of hidden units in each LSTM cell
+      val numHidden = 512
+      // embedding dimension, i.e. map each char to a 256-dim vector
+      val numEmbed = 256
+      // number of LSTM layers
+      val numLstmLayer = 3
+
+      // build char vocabulary from the input
+      val vocab = Utils.buildVocab(stcr.dataPath)
+
+      // load from check-point
+      val (_, argParams, _) = Model.loadCheckpoint(stcr.modelPrefix, 75)
+
+      // build an inference model
+      val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1,
+        numHidden = numHidden, numEmbed = numEmbed,
+        numLabel = vocab.size + 1, argParams = argParams, dropout = 0.2f)
+
+      // generate a sequence of 1200 chars
+      val seqLength = 1200
+      val inputNdarray = NDArray.zeros(1)
+      val revertVocab = Utils.makeRevertVocab(vocab)
+
+      // Feel free to change the starter sentence
+      var output = stcr.starterSentence
+      val randomSample = true
+      var newSentence = true
+      val ignoreLength = output.length()
+
+      for (i <- 0 until seqLength) {
+        if (i <= ignoreLength - 1) Utils.makeInput(output(i), vocab, inputNdarray)
+        else Utils.makeInput(output.takeRight(1)(0), vocab, inputNdarray)
+        val prob = model.forward(inputNdarray, newSentence)
+        newSentence = false
+        val nextChar = Utils.makeOutput(prob, revertVocab, randomSample)
+        if (nextChar == "") newSentence = true
+        if (i >= ignoreLength) output = output ++ nextChar
+      }
+
+      // Let's see what we can learn from the chars in Obama's speech.
+      logger.info(output)
+    } catch {
+      case ex: Exception => {
+        logger.error(ex.getMessage, ex)
+        parser.printUsage(System.err)
+        sys.exit(1)
+      }
+    }
+  }
+}
+
+class TestCharRnn {
+  @Option(name = "--data-path", usage = "the input train data file")
+  private val dataPath: String = null
+  @Option(name = "--model-prefix", usage = "the model prefix")
+  private val modelPrefix: String = null
+  @Option(name = "--starter-sentence", usage = "the starter sentence")
+  private val starterSentence: String = null
+}
diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala
new file mode 100644
index 000000000000..ef15636f836b
--- /dev/null
+++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/TrainCharRnn.scala
@@ -0,0 +1,160 @@
+package ml.dmlc.mxnet.examples.rnn
+
+import ml.dmlc.mxnet._
+import org.kohsuke.args4j.{CmdLineParser, Option}
+import org.slf4j.LoggerFactory
+import scala.collection.JavaConverters._
+import ml.dmlc.mxnet.optimizer.Adam
+
+/**
+ * Follows the demo to train the char rnn:
+ * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
+ * @author Depeng Liang
+ */
+object TrainCharRnn {
+
+  private val logger = LoggerFactory.getLogger(classOf[TrainCharRnn])
+
+  def main(args: Array[String]): Unit = {
+    val incr = new TrainCharRnn
+    val parser: CmdLineParser = new CmdLineParser(incr)
+    try {
+      parser.parseArgument(args.toList.asJava)
+      assert(incr.dataPath != null && incr.saveModelPath != null)
+
+      // The batch size for training
+      val batchSize = 32
+      // We can support variable-length input.
+      // For this problem, we cut each input sentence to a length of 129,
+      // so we only need a single fixed-length bucket
+      val buckets = List(129)
+      // number of hidden units in each LSTM cell
+      val numHidden = 512
+      // embedding dimension, i.e. map each char to a 256-dim vector
+      val numEmbed = 256
+      // number of LSTM layers
+      val numLstmLayer = 3
+      // a quick demo can be seen after 2 epochs;
+      // better results come from training the full 75 epochs
+      val numEpoch = 75
+      // learning rate
+      val learningRate = 0.001f
+      // momentum is defined for completeness; the Adam optimizer below ignores it
+      val momentum = 0.0f
+
+      val ctx = if (incr.gpu == -1) Context.cpu() else Context.gpu(incr.gpu)
+      val vocab = Utils.buildVocab(incr.dataPath)
+
+      // generate the symbol for a given sequence length
+      def symGen(seqLen: Int): Symbol = {
+        Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size + 1,
+          numHidden = numHidden, numEmbed = numEmbed,
+          numLabel = vocab.size + 1, dropout = 0.2f)
+      }
+
+      // initialize states for LSTM
+      val initC = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_c", (batchSize, numHidden))
+      val initH = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_h", (batchSize, numHidden))
+      val initStates = initC ++ initH
+
+      val dataTrain = new ButketIo.BucketSentenceIter(incr.dataPath, vocab, buckets,
+        batchSize, initStates, seperateChar = "\n",
+        text2Id = Utils.text2Id, readContent = Utils.readContent)
+
+      // the network symbol
+      val symbol = symGen(buckets(0))
+
+      val datasAndLabels = dataTrain.provideData ++ dataTrain.provideLabel
+      val (argShapes, outputShapes, auxShapes) = symbol.inferShape(datasAndLabels)
+
+      val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
+
+      val argNames = symbol.listArguments()
+      val argDict = argNames.zip(argShapes.map(NDArray.zeros(_, ctx))).toMap
+      val auxNames = symbol.listAuxiliaryStates()
+      val auxDict = auxNames.zip(auxShapes.map(NDArray.zeros(_, ctx))).toMap
+
+      val gradDict =
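+      // editorial note: gradient buffers are allocated below only for
+      // learnable parameters; the inputs ("data", "softmax_label") are
+      // filtered out since they need no gradient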
argNames.zip(argShapes).filter { case (name, shape) => + !datasAndLabels.contains(name) + }.map(x => x._1 -> NDArray.empty(x._2, ctx) ).toMap + + argDict.foreach { case (name, ndArray) => + if (!datasAndLabels.contains(name)) { + initializer.initWeight(name, ndArray) + } + } + + val data = argDict("data") + val label = argDict("softmax_label") + + val executor = symbol.bind(ctx, argDict, gradDict) + + val opt = new Adam(learningRate = learningRate, wd = 0.0001f) + + val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) => + (idx, name, grad, opt.createState(idx, argDict(name))) + } + + val evalMetric = new CustomMetric(Utils.perplexity, "perplexity") + val batchEndCallback = new Callback.Speedometer(batchSize, 50) + val epochEndCallback = Utils.doCheckpoint(s"${incr.saveModelPath}/obama") + + for (epoch <- 0 until numEpoch) { + // Training phase + val tic = System.currentTimeMillis + evalMetric.reset() + var nBatch = 0 + var epochDone = false + // Iterate over training data. + dataTrain.reset() + while (!epochDone) { + var doReset = true + while (doReset && dataTrain.hasNext) { + val dataBatch = dataTrain.next() + + data.set(dataBatch.data(0)) + label.set(dataBatch.label(0)) + executor.forward(isTrain = true) + executor.backward() + paramsGrads.foreach { case (idx, name, grad, optimState) => + opt.update(idx, argDict(name), grad, optimState) + } + + // evaluate at end, so out_cpu_array can lazy copy + evalMetric.update(dataBatch.label, executor.outputs) + + nBatch += 1 + batchEndCallback.invoke(epoch, nBatch, evalMetric) + } + if (doReset) { + dataTrain.reset() + } + // this epoch is done + epochDone = true + } + val (name, value) = evalMetric.get + logger.info(s"Epoch[$epoch] Train-$name=$value") + val toc = System.currentTimeMillis + logger.info(s"Epoch[$epoch] Time cost=${toc - tic}") + + epochEndCallback.invoke(epoch, symbol, argDict, auxDict) + } + executor.dispose() + } catch { + case ex: Exception => { + logger.error(ex.getMessage, ex) + parser.printUsage(System.err) + sys.exit(1) + } + } + } +} + +class TrainCharRnn { + @Option(name = "--data-path", usage = "the input train data file") + private val dataPath: String = null + @Option(name = "--save-model-path", usage = "the model saving path") + private val saveModelPath: String = null + @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu") + private val gpu: Int = -1 +} diff --git a/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala new file mode 100644 index 000000000000..9fba8c1fd3e4 --- /dev/null +++ b/scala-package/examples/src/main/scala/ml/dmlc/mxnet/examples/rnn/Utils.scala @@ -0,0 +1,137 @@ +package ml.dmlc.mxnet.examples.rnn + +import scala.io.Source +import ml.dmlc.mxnet.EvalMetric +import ml.dmlc.mxnet.NDArray +import ml.dmlc.mxnet.EpochEndCallback +import ml.dmlc.mxnet.Model +import ml.dmlc.mxnet.Symbol +import scala.util.Random + +/** + * @author Depeng Liang + */ +object Utils { + + def readContent(path: String): String = Source.fromFile(path).mkString + + // Build a vocabulary of what char we have in the content + def buildVocab(path: String): Map[String, Int] = { + val content = readContent(path).split("\n") + var idx = 1 // 0 is left for zero padding + var theVocab = Map[String, Int]() + for (line <- content) { + for (char <- line) { + val key = s"$char" + if (!theVocab.contains(key)) { + theVocab = theVocab + (key -> idx) + idx += 1 + } + } + } + 
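// editorial note: ids run from 1 to vocab.size, keeping id 0 for zero padding
+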
theVocab
+  }
+
+  // We assign each char a unique numerical id
+  def text2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
+    val words = for (char <- sentence) yield theVocab(s"$char")
+    words.toArray
+  }
+
+  // Evaluation
+  def perplexity(label: NDArray, pred: NDArray): Float = {
+    val shape = label.shape
+    val size = shape(0) * shape(1)
+    val labelT = {
+      val tmp = label.toArray.grouped(shape(1)).toArray
+      val result = Array.fill[Float](size)(0f)
+      var idx = 0
+      for (i <- 0 until shape(1)) {
+        for (j <- 0 until shape(0)) {
+          result(idx) = tmp(j)(i)
+          idx += 1
+        }
+      }
+      result
+    }
+    var loss = 0f
+    val predArray = pred.toArray.grouped(pred.shape(1)).toArray
+    for (i <- 0 until pred.shape(0)) {
+      loss += -Math.log(Math.max(1e-10, predArray(i)(labelT(i).toInt)).toFloat).toFloat
+    }
+    loss / size
+  }
+
+  def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
+    override def invoke(epoch: Int, symbol: Symbol,
+                        argParams: Map[String, NDArray],
+                        auxStates: Map[String, NDArray]): Unit = {
+      Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
+    }
+  }
+
+  // helper structure for prediction
+  def makeRevertVocab(vocab: Map[String, Int]): Map[Int, String] = {
+    var dic = Map[Int, String]()
+    vocab.foreach { case (k, v) =>
+      dic = dic + (v -> k)
+    }
+    dic
+  }
+
+  // make input from char
+  def makeInput(char: Char, vocab: Map[String, Int], arr: NDArray): Unit = {
+    val idx = vocab(s"$char")
+    val tmp = NDArray.zeros(1)
+    tmp.set(idx)
+    arr.set(tmp)
+  }
+
+  // helper function for random sample
+  def cdf(weights: Array[Float]): Array[Float] = {
+    val total = weights.sum
+    var result = Array[Float]()
+    var cumsum = 0f
+    for (w <- weights) {
+      cumsum += w
+      result = result :+ (cumsum / total)
+    }
+    result
+  }
+
+  def choice(population: Array[String], weights: Array[Float]): String = {
+    assert(population.length == weights.length)
+    val cdfVals = cdf(weights)
+    val x = Random.nextFloat()
+    var idx = 0
+    var found = false
+    for (i <- 0 until cdfVals.length) {
+      if (cdfVals(i) >= x && !found) {
+        idx = i
+        found = true
+      }
+    }
+    population(idx)
+  }
+
+  // the output can either be fixed, by taking the char with the largest
+  // probability, or sampled randomly from the (temperature-scaled) distribution
+  def makeOutput(prob: Array[Float], vocab: Map[Int, String],
+                 sample: Boolean = false, temperature: Float = 1f): String = {
+    var idx = -1
+    val char = if (!sample) {
+      idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>
+        if (max._1 < elem._1) elem else max
+      }._2
+      if (vocab.contains(idx)) vocab(idx)
+      else ""
+    } else {
+      val fixDict = Array("") ++ (1 until vocab.size + 1).map(i => vocab(i))
+      val scaleProb = prob.map(x => if (x < 1e-6) 1e-6 else if (x > 1 - 1e-6) 1 - 1e-6 else x)
+      var rescale = scaleProb.map(x => Math.exp(Math.log(x) / temperature).toFloat)
+      val sum = rescale.sum.toFloat
+      rescale = rescale.map(_ / sum)
+      choice(fixDict, rescale)
+    }
+    char
+  }
+}
diff --git a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
index 5e00481ae416..394176487172 100644
--- a/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
+++ b/scala-package/native/src/main/native/ml_dmlc_mxnet_native_c_api.cc
@@ -139,6 +139,54 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxFuncInvoke
  return ret;
 }

+JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxFuncInvokeEx
+  (JNIEnv *env, jobject obj, jlong funcPtr, jlongArray useVars,
+   jfloatArray scalarArgs, jlongArray mutateVars,
+   jint numParams, jobjectArray
paramKeys, jobjectArray paramVals) { + jlong *cUseVars = env->GetLongArrayElements(useVars, NULL); + jfloat *cScalarArgs = env->GetFloatArrayElements(scalarArgs, NULL); + jlong *cMutateVars = env->GetLongArrayElements(mutateVars, NULL); + jbyte **cParamKeys = NULL; + jbyte **cParamVals = NULL; + if (numParams > 0) { + cParamKeys = new jbyte *[numParams]; + cParamVals = new jbyte *[numParams]; + for (size_t i = 0; i < numParams; i++) { + jbyteArray jkey = reinterpret_cast(env->GetObjectArrayElement(paramKeys, i)); + jbyte *cParamKey = env->GetByteArrayElements(jkey, NULL); + cParamKeys[i] = cParamKey; + env->DeleteLocalRef(jkey); + jbyteArray jval = reinterpret_cast(env->GetObjectArrayElement(paramVals, i)); + jbyte *cParamVal = env->GetByteArrayElements(jval, NULL); + cParamVals[i] = cParamVal; + env->DeleteLocalRef(jval); + } + } + int ret = MXFuncInvokeEx(reinterpret_cast(funcPtr), + reinterpret_cast(cUseVars), + reinterpret_cast(cScalarArgs), + reinterpret_cast(cMutateVars), + static_cast(numParams), + reinterpret_cast(cParamKeys), + reinterpret_cast(cParamVals)); + env->ReleaseLongArrayElements(useVars, cUseVars, 0); + env->ReleaseFloatArrayElements(scalarArgs, cScalarArgs, 0); + env->ReleaseLongArrayElements(mutateVars, cMutateVars, 0); + if (numParams > 0) { + for (size_t i = 0; i < numParams; i++) { + jbyteArray jkey = reinterpret_cast(env->GetObjectArrayElement(paramKeys, i)); + env->ReleaseByteArrayElements(jkey, cParamKeys[i], 0); + env->DeleteLocalRef(jkey); + jbyteArray jval = reinterpret_cast(env->GetObjectArrayElement(paramVals, i)); + env->ReleaseByteArrayElements(jval, cParamVals[i], 0); + env->DeleteLocalRef(jval); + } + delete[] cParamKeys; + delete[] cParamVals; + } + return ret; +} + JNIEXPORT jint JNICALL Java_ml_dmlc_mxnet_LibInfo_mxNDArraySaveRawBytes (JNIEnv *env, jobject obj, jlong ndArrayPtr, jobject dataBuf) { size_t length; diff --git a/src/common/mxrtc.cc b/src/common/mxrtc.cc index 4fd687267409..c1ab065db627 100644 --- a/src/common/mxrtc.cc +++ b/src/common/mxrtc.cc @@ -7,7 +7,7 @@ #include #if ((MXNET_USE_CUDA) && (MXNET_USE_NVRTC)) namespace mxnet { -const std::string MXRtc::str_type = "float"; +const char MXRtc::str_type[] = "float"; std::unordered_map MXRtc::kernel_registry; MXRtc::MXRtc(const std::string& name, @@ -79,36 +79,36 @@ std::string MXRtc::decorate(const std::string& name, std::vector > const& output, const std::string kernel) { std::string source; - source += "\nextern \"C\" __global__ void " + name + "("; + source = source + "\nextern \"C\" __global__ void " + name + "("; for (auto &i : input) { - source += "const " + str_type + "* " + i.first + ","; + source = source + "const " + str_type + "* " + i.first + ","; } for (auto &i : output) { - source += str_type + "* " + i.first + ","; + source = source + str_type + "* " + i.first + ","; } source.pop_back(); source = source + ") {\n"; for (auto &i : input) { - source += "const int " + i.first + "_ndim = " + + source = source + "const int " + i.first + "_ndim = " + std::to_string(i.second.shape().ndim()) + ";\n"; - source += "const int " + i.first + "_dims[] = {"; + source = source + "const int " + i.first + "_dims[] = {"; for (index_t j = 0; j < i.second.shape().ndim(); ++j) { - source += std::to_string(i.second.shape()[j]) + ","; + source = source + std::to_string(i.second.shape()[j]) + ","; } source.pop_back(); - source += "};\n"; + source = source + "};\n"; } for (auto &i : output) { - source += "const int " + i.first + "_ndim = " + + source = source + "const int " + i.first + "_ndim = " + 
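+  // editorial note: with str_type now a plain char array, `a + b` between two
+  // raw C strings would be invalid, so each statement restarts the
+  // concatenation chain from the std::string `source` rather than using `+=`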
std::to_string(i.second.shape().ndim()) + ";\n"; - source += "const int " + i.first + "_dims[] = {"; + source = source + "const int " + i.first + "_dims[] = {"; for (index_t j = 0; j < i.second.shape().ndim(); ++j) { - source += std::to_string(i.second.shape()[j]) + ","; + source = source + std::to_string(i.second.shape()[j]) + ","; } source.pop_back(); - source += "};\n"; + source = source + "};\n"; } - source += kernel + "\n}\n"; + source = source + kernel + "\n}\n"; return source; } diff --git a/src/kvstore/kvstore.cc b/src/kvstore/kvstore.cc index 0de025ba9a35..95000fccae29 100644 --- a/src/kvstore/kvstore.cc +++ b/src/kvstore/kvstore.cc @@ -23,14 +23,16 @@ KVStore* KVStore::Create(const char *type_name) { tname == "local_allreduce_cpu") { kv = new kvstore::KVStoreLocal(); } else if (tname == "device" || + tname == "local_update_device" || tname == "local_allreduce_device") { - tname = "local_allreduce_device"; - kv = new kvstore::KVStoreDevice(); + kv = new kvstore::KVStoreDevice(true); } else if (tname == "dist_async" || tname == "dist_sync" || + tname == "dist_sync_device" || tname == "dist") { #if MXNET_USE_DIST_KVSTORE - kv = new kvstore::KVStoreDist(); + kv = new kvstore::KVStoreDist( + tname.find("device") != std::string::npos); if (tname == "dist_sync" && kv->IsWorkerNode() && kv->get_rank() == 0) { diff --git a/src/kvstore/kvstore_device.h b/src/kvstore/kvstore_device.h index 2667df523272..82c04f9ec337 100644 --- a/src/kvstore/kvstore_device.h +++ b/src/kvstore/kvstore_device.h @@ -21,6 +21,10 @@ namespace kvstore { * \brief Device implementation of KVStore that do reduction on GPU reduction. */ class KVStoreDevice : public KVStoreLocal { + public: + explicit KVStoreDevice(bool device_mode) + : device_mode_(device_mode) {} + protected: using KeyShape = std::pair; void Init(const std::vector& keys, @@ -57,27 +61,27 @@ class KVStoreDevice : public KVStoreLocal { } } - tm_buf.merged = NDArray(s, tm_buf.ctx); + tm_buf.merged = NDArray(s, Context::CPUPinned(tm_buf.ctx.dev_id)); + tm_buf.merged_device = NDArray(s, tm_buf.ctx); ctx_info[tm_buf.ctx.dev_id].second += s.Size(); } } const NDArray& MergePushValue( int key, const std::vector& val, int priority) override { - if (updater_ != nullptr) { - // fall back to CPU based update if updater presents + if (!device_mode_) { return KVStoreLocal::MergePushValue(key, val, priority); } - - if (merge_buf_.empty()) { + if (!buf_initialized_) { InitMergeBuffers(val); + buf_initialized_ = true; } auto& buf = merge_buf_[key]; std::vector reduce(val.size()); - CHECK(!buf.merged.is_none()); - CopyFromTo(val[0], &(buf.merged), priority); - reduce[0] = buf.merged; + CHECK(!buf.merged_device.is_none()); + CopyFromTo(val[0], &(buf.merged_device), priority); + reduce[0] = buf.merged_device; for (size_t i = 1; i < val.size(); ++i) { NDArray *copy_buf = buf.AllocCopyBuf( @@ -85,11 +89,45 @@ class KVStoreDevice : public KVStoreLocal { CopyFromTo(val[i], copy_buf, priority); reduce[i] = *copy_buf; } - ElementwiseSum(reduce, &buf.merged); - return buf.merged; + ElementwiseSum(reduce, &buf.merged_device); + + if (updater_ != nullptr) { + CopyFromTo(buf.merged_device, &(buf.merged)); + return buf.merged; + } else { + return buf.merged_device; + } + } + + void ScatterPullValue( + int key, + const NDArray& src, + const std::vector& vals, + int priority) override { + if (!device_mode_) { + KVStoreLocal::ScatterPullValue(key, src, vals, priority); + return; + } + auto it = merge_buf_.find(key); + if (it != merge_buf_.end() && it->first == key) { + auto& buf = 
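+        // fast path: stage src once into the per-key device buffer and fan it
+        // out from there, so only a single copy from the source is needed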
it->second; + if (!buf.merged_device.is_none()) { + CopyFromTo(src, &(buf.merged_device)); + for (auto* vptr : vals) { + CopyFromTo(buf.merged_device, vptr, priority); + } + return; + } + } + // default, copy back + for (auto* vptr : vals) { + CopyFromTo(src, vptr, priority); + } } private: + bool device_mode_; + bool buf_initialized_{false}; std::vector sorted_key_shape_; }; } // namespace kvstore diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index 270d85101d9f..2705effe0104 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -7,7 +7,7 @@ #define MXNET_KVSTORE_KVSTORE_DIST_H_ #include #include -#include "./kvstore_local.h" +#include "./kvstore_device.h" #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" @@ -25,9 +25,11 @@ namespace kvstore { * it's the server node's job to control the data consistency among all * workers. see details on \ref ServerHandle::Start */ -class KVStoreDist : public KVStoreLocal { +class KVStoreDist : public KVStoreDevice { public: - KVStoreDist() : ps_worker_(nullptr), server_(nullptr) { + explicit KVStoreDist(bool device_mode) + : KVStoreDevice(device_mode), + ps_worker_(nullptr), server_(nullptr) { if (IsWorkerNode()) { ps_worker_ = new ps::KVWorker(0); ps::Start("mxnet\0"); @@ -37,6 +39,7 @@ class KVStoreDist : public KVStoreLocal { virtual ~KVStoreDist() { Engine::Get()->WaitForAll(); if (IsWorkerNode()) { + ps::Postoffice::Get()->Barrier(ps::kWorkerGroup); if (get_rank() == 0) { // stop the executor at servers SendCommandToServers(kStopServer, ""); @@ -112,11 +115,11 @@ class KVStoreDist : public KVStoreLocal { if (buf.is_none()) { buf = NDArray(vals[0]->shape(), pinned_ctx_); } - real_t* data = static_cast(buf.data().dptr_); - size_t size = buf.shape().Size(); - auto pull_from_servers = [this, key, data, size]( + auto pull_from_servers = [this, key, buf] ( RunContext rctx, Engine::CallbackOnComplete cb) { + real_t* data = static_cast(buf.data().dptr_); + size_t size = buf.shape().Size(); // convert to ps keys PSKV& pskv = EncodeKey(key, size); @@ -133,10 +136,7 @@ class KVStoreDist : public KVStoreLocal { {buf.var()}, FnProperty::kNormal, priority); - // copy data from buffer to vals - for (auto v : vals) { - CopyFromTo(buf, v); - } + ScatterPullValue(key, buf, vals, priority); } } @@ -267,6 +267,8 @@ class KVStoreDist : public KVStoreLocal { return pskv; } + // whether use device distributed local sync. 
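+  // editorial note: KVStoreDevice already receives this flag through its
+  // constructor above, so the extra member kept here appears to be unused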
+ bool device_mode_; /** * \brief for worker to push and pull data */ diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h index e897f6437256..3e6ab7b5b3b0 100644 --- a/src/kvstore/kvstore_local.h +++ b/src/kvstore/kvstore_local.h @@ -68,15 +68,11 @@ class KVStoreLocal : public KVStore { if (updater_ != nullptr || it == merge_buf_.end()) { auto it = local_.find(key); CHECK(it != local_.end()) << "key " << key << " has not been inited"; - const NDArray& src = it->second; - for (auto* vptr : grouped_vals[i]) { - CopyFromTo(src, vptr, priority); - } + ScatterPullValue( + key, it->second, grouped_vals[i], priority); } else { - auto& src = it->second.merged; - for (auto* vptr : grouped_vals[i]) { - CopyFromTo(src, vptr, priority); - } + ScatterPullValue( + key, it->second.merged, grouped_vals[i], priority); } } } @@ -88,6 +84,8 @@ class KVStoreLocal : public KVStore { Context ctx; // the merged value NDArray merged; + // the merged value on device + NDArray merged_device; /// \brief the cpu buffer for gpu data std::vector copy_buf; // allocate copy buffer, if it has not been allocated @@ -169,6 +167,16 @@ class KVStoreLocal : public KVStore { return buf.merged; } + virtual void ScatterPullValue( + int key, + const NDArray& src, + const std::vector& vals, + int priority) { + for (auto* vptr : vals) { + CopyFromTo(src, vptr, priority); + } + } + /// \brief buffer for merging push value std::unordered_map merge_buf_; // pinned context diff --git a/src/operator/activation-inl.h b/src/operator/activation-inl.h index fd7c1aa283d4..e6da76d90329 100644 --- a/src/operator/activation-inl.h +++ b/src/operator/activation-inl.h @@ -60,10 +60,6 @@ class ActivationOp : public Operator { Tensor data = in_data[activation::kData].FlatTo2D(s); Tensor out = out_data[activation::kOut].FlatTo2D(s); Assign(out, req[activation::kOut], F(data)); - // Use asynchronize complete notification - // This is only intended as an example of async ops - if (s != NULL) s->Wait(); - ctx.async_on_complete(); } virtual void Backward(const OpContext &ctx, @@ -83,16 +79,6 @@ class ActivationOp : public Operator { Tensor m_out_data = out_data[activation::kOut].FlatTo2D(s); Tensor m_in_grad = in_grad[activation::kData].FlatTo2D(s); Assign(m_in_grad, req[activation::kData], F(m_out_data) * m_out_grad); - // Use asynchronize complete notification - // This is only intended as an example of async ops - if (s != NULL) s->Wait(); - ctx.async_on_complete(); - } - - virtual ExecType exec_type() const { - // Use asynchronize complete notification - // This is only intended as an example of async ops - return kAsync; } }; // class ActivationOp diff --git a/src/operator/batch_norm-inl.h b/src/operator/batch_norm-inl.h index e8c5502d86af..03238b067ea3 100644 --- a/src/operator/batch_norm-inl.h +++ b/src/operator/batch_norm-inl.h @@ -88,6 +88,9 @@ class BatchNormOp : public Operator { Tensor bias = in_data[batchnorm::kBeta].get(s); Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); + + if (ctx.is_train && param_.fix_gamma) slope = 1.f; + // whether use global statistics if (ctx.is_train && !param_.use_global_stats) { Tensor mean = out_data[batchnorm::kMean].get(s); @@ -98,16 +101,10 @@ class BatchNormOp : public Operator { mean = scale * sumall_except_dim<1>(data); var = scale * sumall_except_dim<1>(F( data - broadcast<1>(mean, data.shape_))); - if (param_.fix_gamma) { - Assign(out, req[batchnorm::kOut], (data - broadcast<1>(mean, data.shape_)) / - 
F(broadcast<1>(var + param_.eps, data.shape_)) + - broadcast<1>(bias, out.shape_)); - } else { - Assign(out, req[batchnorm::kOut], broadcast<1>(slope, out.shape_) * - (data - broadcast<1>(mean, data.shape_)) / - F(broadcast<1>(var + param_.eps, data.shape_)) + - broadcast<1>(bias, out.shape_)); - } + Assign(out, req[batchnorm::kOut], broadcast<1>(slope, out.shape_) * + (data - broadcast<1>(mean, data.shape_)) / + F(broadcast<1>(var + param_.eps, data.shape_)) + + broadcast<1>(bias, out.shape_)); } else { Assign(out, req[batchnorm::kOut], broadcast<1>(slope / F(moving_var + param_.eps), @@ -183,19 +180,15 @@ class BatchNormOp : public Operator { sumall_except_dim<1>( grad * (data - broadcast<1>(mean, data.shape_)) / F(broadcast<1>(var + param_.eps, data.shape_)))); - Assign(grad_in, req[batchnorm::kData], - (grad * broadcast<1>(slope, data.shape_)) * - broadcast<1>(1.0f / F(var + param_.eps), data.shape_) + - broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean, - data.shape_)) + - broadcast<1>(gmean, data.shape_) * scale); } else { - Assign(grad_in, req[batchnorm::kData], grad * - broadcast<1>(1.0f / F(var + param_.eps), data.shape_) + - broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean, - data.shape_)) + - broadcast<1>(gmean, data.shape_) * scale); + Assign(gslope, req[batchnorm::kGamma], 0.0f); } + Assign(grad_in, req[batchnorm::kData], + (grad * broadcast<1>(slope, data.shape_)) * + broadcast<1>(1.0f / F(var + param_.eps), data.shape_) + + broadcast<1>(gvar, data.shape_) * scale * 2.0f * (data - broadcast<1>(mean, + data.shape_)) + + broadcast<1>(gmean, data.shape_) * scale); Assign(gbias, req[batchnorm::kBeta], sumall_except_dim<1>(grad)); } else { // use global statistics with freeze moving mean and var. 
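+      // editorial note on the fix_gamma change in this file: during training
+      // the forward pass now forces gamma (slope) to 1 and the backward pass
+      // writes an explicit zero gradient for gamma, so gamma stays fixed
+      // while the data gradient keeps its uniform slope term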
@@ -204,14 +197,12 @@ class BatchNormOp : public Operator { sumall_except_dim<1>( grad * (data - broadcast<1>(moving_mean, data.shape_)) / F(broadcast<1>(moving_var + param_.eps, data.shape_)))); - Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) * - broadcast<1>( - 1.0f / F(moving_var + param_.eps), data.shape_)); } else { - Assign(grad_in, req[batchnorm::kData], grad * - broadcast<1>( - 1.0f / F(moving_var + param_.eps), data.shape_)); + Assign(gslope, req[batchnorm::kGamma], 0.0f); } + Assign(grad_in, req[batchnorm::kData], (grad * broadcast<1>(slope, data.shape_)) * + broadcast<1>( + 1.0f / F(moving_var + param_.eps), data.shape_)); } } diff --git a/src/operator/block_grad-inl.h b/src/operator/block_grad-inl.h index ff5262d4e04a..eaf39ce59ac4 100644 --- a/src/operator/block_grad-inl.h +++ b/src/operator/block_grad-inl.h @@ -24,7 +24,7 @@ enum BlockGradientOpInputs {kData}; enum BlockGradientOpOutputs {kOut}; } // namespace blockgrad -template +template class BlockGradientOp : public Operator { public: virtual void Forward(const OpContext &ctx, @@ -37,8 +37,8 @@ class BlockGradientOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[blockgrad::kData].FlatTo2D(s); - Tensor out = out_data[blockgrad::kOut].FlatTo2D(s); + Tensor data = in_data[blockgrad::kData].FlatTo2D(s); + Tensor out = out_data[blockgrad::kOut].FlatTo2D(s); out = F(data); } @@ -52,13 +52,13 @@ class BlockGradientOp : public Operator { using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - Tensor grad = in_grad[blockgrad::kData].FlatTo2D(s); + Tensor grad = in_grad[blockgrad::kData].FlatTo2D(s); grad = 0.f; } }; // class BlockGradientOp template -Operator *CreateOp(); +Operator *CreateOp(int dtype); #if DMLC_USE_CXX11 class BlockGradientProp : public OperatorProperty { @@ -81,6 +81,17 @@ class BlockGradientProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "Input must have specified type"; + out_type->clear(); + out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { return new BlockGradientProp(); } @@ -102,7 +113,13 @@ class BlockGradientProp : public OperatorProperty { return {{in_data[blockgrad::kData], out_data[blockgrad::kOut]}}; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; }; // class BlockGradientProperty #endif // DMLC_USE_CXX11 diff --git a/src/operator/block_grad.cc b/src/operator/block_grad.cc index 67256f79f268..764618f51622 100644 --- a/src/operator/block_grad.cc +++ b/src/operator/block_grad.cc @@ -9,12 +9,21 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp() { - return new BlockGradientOp(); +Operator *CreateOp(int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new BlockGradientOp(); + }); + return op; } -Operator *BlockGradientProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp); +Operator *BlockGradientProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector 
out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, in_type->at(0)); } MXNET_REGISTER_OP_PROPERTY(BlockGrad, BlockGradientProp) diff --git a/src/operator/block_grad.cu b/src/operator/block_grad.cu index 22707e940b7e..af5fc1660797 100644 --- a/src/operator/block_grad.cu +++ b/src/operator/block_grad.cu @@ -9,8 +9,12 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp() { - return new BlockGradientOp(); +Operator *CreateOp(int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new BlockGradientOp(); + }); + return op; } } // namespace op diff --git a/src/operator/broadcast_reduce_op-inl.h b/src/operator/broadcast_reduce_op-inl.h index f43bafbc16da..2457948c2d7f 100644 --- a/src/operator/broadcast_reduce_op-inl.h +++ b/src/operator/broadcast_reduce_op-inl.h @@ -9,6 +9,7 @@ #include #include #include "./mshadow_op.h" +#include "./broadcast_reduce_op_common.h" #if defined(__CUDACC__) #define XPU gpu @@ -21,10 +22,11 @@ namespace op { struct ReduceAxisParam : public dmlc::Parameter { bool keepdims; - int axis; + TShape axis; DMLC_DECLARE_PARAMETER(ReduceAxisParam) { - DMLC_DECLARE_FIELD(axis).set_default(-1).set_lower_bound(-1) - .describe("The axis to perform the reduction. axis=-1 means to reduce all dimensions"); + DMLC_DECLARE_FIELD(axis).set_default(TShape()) + .describe("Same as Numpy. The axes to perform the reduction." + "If left empty, a global reduction will be performed."); DMLC_DECLARE_FIELD(keepdims).set_default(false) .describe("Same as Numpy. If keepdims is set to true, " "the axis which is reduced is left in the result as dimension with size one."); @@ -32,13 +34,24 @@ struct ReduceAxisParam : public dmlc::Parameter { }; struct BroadcastAxisParam : public dmlc::Parameter { - int axis; - int size; + TShape axis; + TShape size; DMLC_DECLARE_PARAMETER(BroadcastAxisParam) { - DMLC_DECLARE_FIELD(axis).set_default(0).set_lower_bound(0) - .describe("The target axis of broadcasting."); - DMLC_DECLARE_FIELD(size).set_default(0).set_lower_bound(1) - .describe("Size of the broadcasting axis."); + DMLC_DECLARE_FIELD(axis).set_default(TShape()) + .describe("The axes to perform the broadcasting."); + DMLC_DECLARE_FIELD(size).set_default(TShape()) + .describe("Target sizes of the broadcasting axes."); + } +}; + +struct BroadcastToParam : public dmlc::Parameter { + TShape shape; + DMLC_DECLARE_PARAMETER(BroadcastToParam) { + DMLC_DECLARE_FIELD(shape).set_default(TShape()) + .describe("The shape of the desired array." + " We can set the dim to zero if it's same as the original." + " E.g `A = broadcast_to(B, shape=(10, 0, 0))` " + "has the same meaning as `A = broadcast_axis(B, axis=0, size=10)`."); } }; @@ -46,26 +59,24 @@ inline TShape ReduceAxisShape(const TShape& ishape, const EnvArguments& env) { ReduceAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < static_cast(ishape.ndim()) || -1 == param.axis) << - "axis must be smaller than the source ndim or equal to -1! 
Received axis=" << - param.axis << ", src_ndim=" << ishape.ndim(); - if (param.axis == -1 || (1 == ishape.ndim())) { - if (param.keepdims) { - return TShape(ishape.ndim()); - } else { - return TShape(1); + std::vector axes = ParseAxes_(param.axis, ishape.ndim()); + if (axes.size() == 0) { + for (index_t i = 0; i < ishape.ndim(); ++i) { + axes.push_back(i); } } std::vector shape; for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) == param.axis) { - if (param.keepdims) { - shape.push_back(1); - } - } else { + if (!std::binary_search(axes.begin(), axes.end(), i)) { shape.push_back(ishape[i]); + } else if (param.keepdims) { + shape.push_back(1); } } + // We need to treat the global reduction case specially to avoid an empty output TShape. + if (shape.size() == 0) { + shape.push_back(1); + } return TShape(shape.begin(), shape.end()); } @@ -73,20 +84,32 @@ inline TShape BroadcastAxisShape(const TShape& ishape, const EnvArguments& env) { BroadcastAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < static_cast(ishape.ndim())) << - "axis must be smaller than the source ndim" << param.axis << ", src_ndim=" << ishape.ndim(); - CHECK_EQ(ishape[param.axis], 1) << - "Size of the broadcasting axis in the source must be 1, axis=" << param.axis - << ", size=" << ishape[param.axis]; - std::vector shape; - for (index_t i = 0; i < ishape.ndim(); ++i) { - if (static_cast(i) != param.axis) { - shape.push_back(ishape[i]); - } else { - shape.push_back(param.size); + CHECK_EQ(param.axis.ndim(), param.size.ndim()); + TShape ret = ishape; + for (index_t i = 0; i < param.axis.ndim(); i++) { + CHECK_EQ(ishape[param.axis[i]], 1) << + "Size of the broadcasting axis in the source must be 1, axis=" << param.axis + << ", size=" << param.size; + ret[param.axis[i]] = param.size[i]; + } + return ret; +} + +inline TShape BroadcastToShape(const TShape& ishape, + const EnvArguments& env) { + BroadcastToParam param; + param.Init(env.kwargs); + CHECK_EQ(param.shape.ndim(), ishape.ndim()); + TShape ret = ishape; + for (index_t i = 0; i < param.shape.ndim(); i++) { + if (param.shape[i] > 0 && (param.shape[i] != ishape[i])) { + CHECK_EQ(ishape[i], 1) << + "Size of the broadcasting axis in the source must be 1, src_shape=" << ishape + << ", broadcast_to=" << param.shape; + ret[i] = param.shape[i]; } } - return TShape(shape.begin(), shape.end()); + return ret; } // return a shape of scalar @@ -103,47 +126,17 @@ void L2Norm(const TBlob &src, OpReqType req, RunContext ctx) { mshadow::Stream *s = ctx.get_stream(); - mshadow::Tensor out = ret->get(s); - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape1(src.shape_.Size()), s); - mshadow::VectorDot(out, in, in); - out = mshadow::expr::F(out); -} - -template -void Reduce(const TBlob &src, - const EnvArguments& env, - TBlob *ret, - OpReqType req, - RunContext ctx) { - mshadow::Stream *s = ctx.get_stream(); - mshadow::Tensor out = ret->get(s); - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape2(1, src.shape_.Size()), s); - out = mshadow::expr::reduce_except_dim<0, Reducer>(in); -} - -// backward function that takes input value of the op -template -void SumBackward_(const OutputGrad& scale, - const EnvArguments& env, - TBlob *in_grad, - OpReqType req, - RunContext ctx) { - using namespace mxnet::op; - using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - CHECK_EQ(in_grad->type_flag_, scale.data.type_flag_) - << "Unary function only support input/output with the same type"; - MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { - 
mshadow::Tensor mscale = scale.data.get(s); - mshadow::Tensor igrad = in_grad->FlatTo2D(s); - ASSIGN_DISPATCH(igrad, req, - broadcast_scalar(mscale, igrad.shape_)); + CHECK_EQ(src.type_flag_, ret->type_flag_); + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + mshadow::Tensor out = ret->get(s); + mshadow::Tensor in = + src.get_with_shape(mshadow::Shape1(src.shape_.Size()), s); + mshadow::VectorDot(out, in, in); + ASSIGN_DISPATCH(out, req, mshadow::expr::F(out)); }); } -template +template void ReduceChannel(const TBlob &src, const EnvArguments& env, TBlob *ret, @@ -153,13 +146,17 @@ void ReduceChannel(const TBlob &src, using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - Tensor out = ret->get_with_shape( - Shape2(src.shape_[0], src.Size()/src.shape_[0]/src.shape_[1]), - s); - Tensor in = src.get_with_shape( - Shape3(src.shape_[0], src.shape_[1], src.Size()/src.shape_[0]/src.shape_[1]), + CHECK_EQ(src.type_flag_, ret->type_flag_); + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor out = ret->get_with_shape( + Shape2(src.shape_[0], src.Size() / src.shape_[0] / src.shape_[1]), s); - out = reduce_with_axis(in, 1); + Tensor in = src.get_with_shape( + Shape3(src.shape_[0], src.shape_[1], src.Size() / src.shape_[0] / src.shape_[1]), + s); + CHECK(req != kAddTo) << "AddTo is not supported"; + ASSIGN_DISPATCH(out, req, (reduce_with_axis(in, 1))); + }); } // return a shape of ReduceChannel output @@ -174,39 +171,49 @@ inline TShape ReduceChannelShape(const TShape& ishape, } // Reduce the given axis -template +template void ReduceAxisImpl_(const TBlob &src, - const EnvArguments& env, - TBlob *ret, - OpReqType req, - RunContext ctx, - int axis, - bool keepdims) { + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx, + TShape axes) { + using namespace mshadow; using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - if (-1 == axis) { - // Reduce all dimensions if axis == -1 - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape2(1, src.shape_.Size()), s); - mshadow::Tensor out = - ret->get_with_shape(mshadow::Shape1(ret->shape_.Size()), s); - out = mshadow::expr::reduce_except_dim<0, Reducer>(in); + Stream *s = ctx.get_stream(); + CHECK_EQ(src.type_flag_, ret->type_flag_); + // If the axes is empty, we just need to give an identity mapping. + if (axes.ndim() == 0) { + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor in = src.FlatTo2D(s); + Tensor out = ret->FlatTo2D(s); + ASSIGN_DISPATCH(out, req, F(in)); + }); return; } - int trailing = 1; - int leading = 1; - for (int i = 0; i < src.shape_.ndim(); ++i) { - if (i < axis) { - leading *= src.shape_[i]; - } else if (i > axis) { - trailing *= src.shape_[i]; + bool is_contiguous_axes; + index_t reducing_size; + CheckContiguousAxes_(&is_contiguous_axes, &reducing_size, axes, src.shape_); + if (is_contiguous_axes) { + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor in = src.FlatTo3D(axes[0], axes[axes.ndim() - 1], s); + Tensor out = + ret->get_with_shape(mshadow::Shape1(ret->Size()), s); + ReduceAxesAssign(out, req, TShape(1), in); + }); + } else { + Shape padded_shape_; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { + padded_shape_[i] = (i < src.ndim()) ? 
src.shape_[i] : 1; } + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor in = + src.get_with_shape(padded_shape_, s); + Tensor out = + ret->get_with_shape(mshadow::Shape1(ret->Size()), s); + ReduceAxesAssign(out, req, axes, in); + }); } - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape3(leading, src.shape_[axis], trailing), s); - mshadow::Tensor out = - ret->get_with_shape(mshadow::Shape2(leading, trailing), s); - out = mshadow::expr::reduce_with_axis(in, 1); } // Broadcast the given axis to the given broadcasting size @@ -216,39 +223,55 @@ void BroadcastAxisImpl_(const TBlob &src, TBlob *ret, OpReqType req, RunContext ctx, - int axis, - int bsize, - bool keepdims) { + const TShape &axes, + const TShape &bsizes) { + using namespace mshadow; using namespace mshadow::expr; - mshadow::Stream *s = ctx.get_stream(); - if (axis == -1) { - MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape1(src.shape_.Size()), s); - mshadow::Tensor out = ret->FlatTo2D(s); - ASSIGN_DISPATCH(out, req, - broadcast_scalar(in, out.shape_)); + Stream *s = ctx.get_stream(); + CHECK_EQ(src.type_flag_, ret->type_flag_); + // If the axes is empty, we just need to give an identity mapping. + if (axes.ndim() == 0) { + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor in = src.FlatTo2D(s); + Tensor out = ret->FlatTo2D(s); + ASSIGN_DISPATCH(out, req, F(in)); }); return; } - int trailing = 1; - int leading = 1; - for (int i = 0; i < ret->shape_.ndim(); ++i) { - if (i < axis) { - leading *= ret->shape_[i]; - } else if (i > axis) { - trailing *= ret->shape_[i]; + bool is_contiguous_axes; + index_t broadcasting_size; + CheckContiguousAxes_(&is_contiguous_axes, &broadcasting_size, axes, ret->shape_); + if (is_contiguous_axes) { + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor out = ret->FlatTo3D(axes[0], axes[axes.ndim() - 1], s); + Tensor in = + src.get_with_shape(Shape3(out.shape_[0], 1, out.shape_[2]), s); + ASSIGN_DISPATCH(out, req, broadcast_keepdim(in, 1, broadcasting_size)); + }); + } else { + CHECK(ret->ndim() <= MXNET_SPECIAL_MAX_NDIM) << "non-contiguous axis supports ndim up to " + << MXNET_SPECIAL_MAX_NDIM; + Shape padded_src_shape_; + Shape padded_ret_shape_; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; ++i) { + padded_ret_shape_[i] = (i < ret->ndim()) ? ret->shape_[i] : 1; + } + padded_src_shape_ = padded_ret_shape_; + for (index_t i = 0; i < axes.ndim(); ++i) { + padded_src_shape_[axes[i]] = 1; } + MSHADOW_REAL_TYPE_SWITCH(src.type_flag_, DType, { + Tensor in = + src.get_with_shape(padded_src_shape_, s); + Tensor out = + ret->get_with_shape(padded_ret_shape_, s); + ASSIGN_DISPATCH(out, req, broadcast_multi_axes(in, axes, bsizes)); + }); } - mshadow::Tensor in = - src.get_with_shape(mshadow::Shape2(leading, trailing), s); - mshadow::Tensor out = - ret->get_with_shape(mshadow::Shape3(leading, bsize, trailing), s); - out = mshadow::expr::broadcast_with_axis(in, 0, bsize); } // Forward pass of reduce over the given axis -template +template void ReduceAxis(const TBlob &src, const EnvArguments& env, TBlob *ret, @@ -257,10 +280,14 @@ void ReduceAxis(const TBlob &src, using namespace mshadow::expr; ReduceAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < src.shape_.ndim() || -1 == param.axis) << - "axis must be smaller than the source ndim or equals to -1!" 
- " Received axis=" << param.axis << ", src_ndim=" << src.shape_.ndim(); - ReduceAxisImpl_(src, env, ret, req, ctx, param.axis, param.keepdims); + std::vector axes = ParseAxes_(param.axis, src.ndim()); + if (axes.size() == 0) { + for (index_t i = 0; i < src.ndim(); i++) { + axes.push_back(i); + } + } + ReduceAxisImpl_(src, env, ret, req, ctx, + TShape(axes.begin(), axes.end())); } // Backward pass of reduce over the given axis @@ -274,17 +301,18 @@ void SumAxisGrad_(const OutputGrad& out_grad, using namespace mshadow::expr; ReduceAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < in_grad->shape_.ndim() || param.axis == -1) << - "axis must be smaller than the input grad ndim or equals to -1." - " Received axis=" << param.axis << ", igrad_ndim=" << in_grad->shape_.ndim(); - CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) - << "Unary function only support input/output with the same type"; - if (-1 == param.axis) { - BroadcastAxisImpl_(out_grad.data, env, in_grad, req, ctx, param.axis, 0, param.keepdims); - } else { - BroadcastAxisImpl_(out_grad.data, env, in_grad, req, ctx, param.axis, - in_grad->shape_[param.axis], param.keepdims); + std::vector axes = ParseAxes_(param.axis, in_grad->ndim()); + if (axes.size() == 0) { + for (index_t i = 0; i < in_grad->ndim(); i++) { + axes.push_back(i); + } } + std::vector bsizes; + for (std::vector::iterator it = axes.begin(); it != axes.end(); ++it) { + bsizes.push_back(in_grad->shape_[*it]); + } + BroadcastAxisImpl_(out_grad.data, env, in_grad, req, ctx, + TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end())); } // Forward pass of broadcast over the given axis @@ -297,13 +325,13 @@ void BroadcastAxis(const TBlob &src, using namespace mshadow::expr; BroadcastAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < src.shape_.ndim()) << - "axis must be smaller than the source ndim" << param.axis << - ", src_ndim=" << src.shape_.ndim(); - CHECK_EQ(src.shape_[param.axis], 1) << - "Size of the broadcasting axis in the source must be 1, " - "axis=" << param.axis << ", size=" << src.shape_[param.axis]; - BroadcastAxisImpl_(src, env, ret, req, ctx, param.axis, param.size, true); + std::vector axes = ParseAxes_(param.axis, src.ndim()); + std::vector bsizes; + for (std::vector::iterator it = axes.begin(); it != axes.end(); ++it) { + bsizes.push_back(ret->shape_[*it]); + } + BroadcastAxisImpl_(src, env, ret, req, ctx, + TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end())); } // Backward pass of broadcast over the given axis @@ -317,18 +345,49 @@ void BroadcastAxisGrad_(const OutputGrad& out_grad, using namespace mshadow::expr; BroadcastAxisParam param; param.Init(env.kwargs); - CHECK(param.axis < in_grad->shape_.ndim()) << - "axis must be smaller than the source ndim" << param.axis << - ", src_ndim=" << in_grad->shape_.ndim(); - CHECK_EQ(in_grad->shape_[param.axis], 1) << - "Size of the broadcasting axis in the source must be 1, " - "axis=" << param.axis << ", size=" << in_grad->shape_[param.axis]; - CHECK_EQ(in_grad->type_flag_, out_grad.data.type_flag_) - << "Unary function only support input/output with the same type"; - ReduceAxisImpl_(out_grad.data, env, in_grad, req, ctx, - param.axis, true); + std::vector axes = ParseAxes_(param.axis, in_grad->ndim()); + ReduceAxisImpl_(out_grad.data, env, in_grad, req, ctx, + TShape(axes.begin(), axes.end())); } +// Forward pass of broadcast_to +template +void BroadcastTo(const TBlob &src, + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx) 
{ + using namespace mshadow::expr; + std::vector<index_t> axes; + std::vector<index_t> bsizes; + for (index_t i = 0; i < src.shape_.ndim(); ++i) { + if (src.shape_[i] != ret->shape_[i]) { + axes.push_back(i); + bsizes.push_back(ret->shape_[i]); + } + } + BroadcastAxisImpl_(src, env, ret, req, ctx, + TShape(axes.begin(), axes.end()), TShape(bsizes.begin(), bsizes.end())); +} + +// Backward pass of broadcast_to +template +void BroadcastToGrad_(const OutputGrad& out_grad, + const EnvArguments& env, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mxnet::op; + using namespace mshadow::expr; + std::vector<index_t> axes; + for (index_t i = 0; i < in_grad->shape_.ndim(); ++i) { + if (out_grad.data.shape_[i] != in_grad->shape_[i]) { + axes.push_back(i); + } + } + ReduceAxisImpl_(out_grad.data, env, in_grad, req, ctx, + TShape(axes.begin(), axes.end())); +} // L2 norm @@ -337,56 +396,69 @@ MXNET_REGISTER_SIMPLE_OP(norm, XPU) .set_shape_function(ScalarShape) .describe("Take L2 norm of the src." "The result will be ndarray of shape (1,) on the same device."); + // Max MXNET_REGISTER_SIMPLE_OP(max, XPU) -.set_function(XPU::kDevMask, Reduce, kNoInplace, kNotRegisterSymbolic) -.set_shape_function(ScalarShape) -.describe("(Deprecated! Use max_axis instead.) Take max of the src." "The result will be ndarray of shape (1,) on the same device."); +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, ReduceAxis, +kNoInplace, kNotRegisterSymbolic) +.set_shape_function(ReduceAxisShape) +.describe("Take max of the src in the given axis and returns an NDArray. Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); + // Min MXNET_REGISTER_SIMPLE_OP(min, XPU) -.set_function(XPU::kDevMask, Reduce, kNoInplace, kNotRegisterSymbolic) -.set_shape_function(ScalarShape) -.describe("(Deprecated! Use min_axis instead.) Take min of the src." "The result will be ndarray of shape (1,) on the same device."); +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, ReduceAxis, +kNoInplace, kNotRegisterSymbolic) +.set_shape_function(ReduceAxisShape) +.describe("Take min of the src in the given axis and returns an NDArray. Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); + // Sum MXNET_REGISTER_SIMPLE_OP(sum, XPU) -.set_function(XPU::kDevMask, Reduce, kNoInplace, kRegisterSymbolic) -.set_shape_function(ScalarShape) -.set_gradient(XPU::kDevMask, SumBackward_, kNoInplace) -.describe("(Deprecated! Use sum_axis instead.) Take sum of the src." "The result will be ndarray of shape (1,) on the same device."); +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, ReduceAxis, +kNoInplace, kRegisterSymbolic) +.set_shape_function(ReduceAxisShape) +.set_gradient(XPU::kDevMask, SumAxisGrad_, kNoInplace) +.describe("Take sum of the src in the given axis and returns an NDArray. Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); +
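The max/min/sum registrations above now follow numpy reduction semantics: axis is a shape (a tuple of axes from Python), an empty axis means a global reduction, and keepdims keeps reduced dimensions as size 1. A minimal standalone sketch of the resulting output-shape rule (plain C++; the helper name reduce_shape is the editor's, not part of the patch):

#include <algorithm>
#include <cstdio>
#include <vector>

// Mirrors ReduceAxisShape: drop (or keep as 1) every reduced axis.
std::vector<int> reduce_shape(const std::vector<int>& ishape,
                              std::vector<int> axes, bool keepdims) {
  std::sort(axes.begin(), axes.end());
  if (axes.empty())  // empty axis list => global reduction
    for (int i = 0; i < static_cast<int>(ishape.size()); ++i) axes.push_back(i);
  std::vector<int> shape;
  for (int i = 0; i < static_cast<int>(ishape.size()); ++i) {
    if (!std::binary_search(axes.begin(), axes.end(), i)) shape.push_back(ishape[i]);
    else if (keepdims) shape.push_back(1);
  }
  if (shape.empty()) shape.push_back(1);  // global reduction yields shape (1,)
  return shape;
}

int main() {
  std::vector<int> out = reduce_shape({2, 3, 4}, {0, 2}, false);
  for (int d : out) std::printf("%d ", d);  // prints: 3
  return 0;
}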
Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); // min_axis MXNET_REGISTER_SIMPLE_OP(min_axis, XPU) .set_enable_kwargs(true) -.set_function(XPU::kDevMask, ReduceAxis, +.set_function(XPU::kDevMask, ReduceAxis, kNoInplace, kNotRegisterSymbolic) .set_shape_function(ReduceAxisShape) -.describe("Take min of the src in the given axis. axis=-1 means to reduce all the dimensions." -"The keepdims option has the same meaning as Numpy."); +.describe("(Depreciated! Use min instead!)" + " Take min of the src in the given axis and returns a NDArray. Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); // sum_axis MXNET_REGISTER_SIMPLE_OP(sum_axis, XPU) .set_enable_kwargs(true) -.set_function(XPU::kDevMask, ReduceAxis, +.set_function(XPU::kDevMask, ReduceAxis, kNoInplace, kRegisterSymbolic) .set_shape_function(ReduceAxisShape) .set_gradient(XPU::kDevMask, SumAxisGrad_, kNoInplace) -.describe("Take sum of the src in the given axis. axis=-1 means to reduce all the dimensions." -"The keepdims option has the same meaning as Numpy."); +.describe("(Depreciated! Use sum instead!)" + " Take sum of the src in the given axis and returns a NDArray. Follows numpy semantics.") +.add_arguments(ReduceAxisParam::__FIELDS__()); // argmax channel MXNET_REGISTER_SIMPLE_OP(argmax_channel, XPU) -.set_function(XPU::kDevMask, ReduceChannel, +.set_function(XPU::kDevMask, ReduceChannel, kNoInplace, kNotRegisterSymbolic) .set_shape_function(ReduceChannelShape) .describe("Take argmax indices of each channel of the src." @@ -400,7 +472,19 @@ MXNET_REGISTER_SIMPLE_OP(broadcast_axis, XPU) .set_shape_function(BroadcastAxisShape) .set_gradient(XPU::kDevMask, BroadcastAxisGrad_, kNoInplace) .describe("Broadcast data in the given axis to the given size. " -"The original size of the broadcasting axis must be 1."); + "The original size of the broadcasting axis must be 1.") +.add_arguments(BroadcastAxisParam::__FIELDS__()); + +// broadcast_to +MXNET_REGISTER_SIMPLE_OP(broadcast_to, XPU) +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, BroadcastTo, +kNoInplace, kRegisterSymbolic) +.set_shape_function(BroadcastToShape) +.set_gradient(XPU::kDevMask, BroadcastToGrad_, kNoInplace) +.describe("Broadcast data to the target shape. " + "The original size of the broadcasting axis must be 1.") +.add_arguments(BroadcastToParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/broadcast_reduce_op.cc b/src/operator/broadcast_reduce_op.cc index 213aabb4b033..5c731f0284bd 100644 --- a/src/operator/broadcast_reduce_op.cc +++ b/src/operator/broadcast_reduce_op.cc @@ -7,7 +7,10 @@ #include "./broadcast_reduce_op-inl.h" namespace mxnet { namespace op { - DMLC_REGISTER_PARAMETER(ReduceAxisParam); - DMLC_REGISTER_PARAMETER(BroadcastAxisParam); + +DMLC_REGISTER_PARAMETER(ReduceAxisParam); +DMLC_REGISTER_PARAMETER(BroadcastAxisParam); +DMLC_REGISTER_PARAMETER(BroadcastToParam); + } // namespace op } // namespace mxnet diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h new file mode 100644 index 000000000000..179935d7a882 --- /dev/null +++ b/src/operator/broadcast_reduce_op_common.h @@ -0,0 +1,186 @@ +/*! 
diff --git a/src/operator/broadcast_reduce_op_common.h b/src/operator/broadcast_reduce_op_common.h new file mode 100644 index 000000000000..179935d7a882 --- /dev/null +++ b/src/operator/broadcast_reduce_op_common.h @@ -0,0 +1,186 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file broadcast_reduce_op_common.h +* \brief common functions used for broadcasting and reducing +* \author Xingjian Shi +*/ +#ifndef MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_ +#define MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_ +#include +#include +#include +#include +#include + +namespace mxnet { +namespace op { +/*! +* \brief Sort the given axes and remove duplicate keys to get a vector +* \param param_axis the input axis +* \param max_ndim the maximum ndim +*/ +inline std::vector<index_t> ParseAxes_(const TShape& param_axis, index_t max_ndim) { + std::set<index_t> axes_set_; + std::vector<index_t> axes; + for (index_t i = 0; i < param_axis.ndim(); i++) { + CHECK(param_axis[i] < max_ndim) << "axes must be within the range, ndim of the source=" + << max_ndim << ", axis=" << param_axis; + CHECK_EQ(axes_set_.find(param_axis[i]), axes_set_.end()) + << "Duplicate value in 'axis', received:" << param_axis; + axes_set_.insert(param_axis[i]); + } + for (std::set<index_t>::iterator it = axes_set_.begin(); it != axes_set_.end(); ++it) { + axes.push_back(*it); + } + return axes; +} + +/*! +* \brief Check if the axes are contiguous and get the reducing size. E.g (1, 3) -> false, (1,2,3) -> true +* \param is_contiguous_axes whether the axes are contiguous +* \param reducing_size product of the source shape in the given axes +* \param axes +* \param src_shape shape of the source tensor +*/ +inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size, + const mshadow::TShape &axes, const mshadow::TShape &src_shape) { + *is_contiguous_axes = true; + *reducing_size = 1; + for (index_t i = 0; i < axes.ndim(); ++i) { + *reducing_size *= src_shape[axes[i]]; + if (i > 0) { + *is_contiguous_axes = *is_contiguous_axes && (axes[i] == (axes[i - 1] + 1)); + CHECK(axes[i - 1] < axes[i]) << "axes must be in increasing order, received axes=" << axes; + } + } +} + +template<int dimsrc> +inline void CheckContiguousAxes_(bool *is_contiguous_axes, index_t *reducing_size, + const mshadow::TShape &axes, const mshadow::Shape<dimsrc> &src_shape) { + CheckContiguousAxes_(is_contiguous_axes, reducing_size, axes, + TShape(src_shape.shape_, src_shape.shape_ + dimsrc)); +} + +inline TShape GetBroadcastingAxes_(const mshadow::TShape &src_shape, + const mshadow::TShape &target_shape) { + std::vector<index_t> axes_vec; + CHECK_EQ(target_shape.ndim(), src_shape.ndim()); + for (index_t i = 0; i < src_shape.ndim(); ++i) { + if (src_shape[i] != target_shape[i]) { + CHECK_EQ(src_shape[i], 1) << "broadcasting axis must have size 1, received src_shape=" + << src_shape << " target_shape=" << target_shape; + axes_vec.push_back(i); + } + } + TShape axes = TShape(axes_vec.begin(), axes_vec.end()); + return axes; +} + +/*! +* \brief a reduce over multiple axes and assign to the output tensor. +* \param out output tensor, must have dim 1 +* \param src the source expression +* \param axes the given axes, should be in increasing order +* \tparam Reducer type of the reducing operation +* \tparam xpu +* \tparam SrcExp the src expression template +* \tparam etype type of expression +*/ +template +void ReduceAxesAssign(mshadow::Tensor out, const OpReqType req, + const TShape &axes, const SrcExp &src_) { + using namespace mshadow; + using namespace mshadow::expr; + static const int dimsrc = ExpInfo<SrcExp>::kDim; + CHECK(axes.ndim() <= dimsrc); + Shape<dimsrc> src_shape = ShapeCheck<dimsrc, SrcExp>::Check(src_); + + // 1. Check if the axes have size 0; if so, no reducing is needed.
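+  //    (Editor's example, hypothetical numbers: with axes = () and a (2, 3, 4)
+  //    source, the reshape below flattens to Shape1(24) and copies all 24
+  //    elements through, i.e. an empty axis list is an identity mapping.)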
+ if (0 == axes.ndim()) { + ASSIGN_DISPATCH(out, req, reshape(src_, Shape1(src_shape.ProdShape(0, dimsrc)))); + return; + } + + // 2. Check if we want to reduce over contiguous axes and get the reducing size. + // e.g. (1,2,3) --> contiguous, (1,3) --> noncontiguous + bool is_contiguous_axes = true; + index_t reducing_size = 1; + CheckContiguousAxes_(&is_contiguous_axes, &reducing_size, axes, src_shape); + + // 3. For contiguous axes, we can always reshape them to (leading, reducing_size, trailing) + // and we can then simplify the combination of mshadow symbols. + if (is_contiguous_axes) { + index_t leading = 1; + index_t trailing = 1; + for (index_t i = 0; i < dimsrc; ++i) { + if (i < axes[0]) { + leading *= src_shape[i]; + } else if (i > axes[axes.ndim() - 1]) { + trailing *= src_shape[i]; + } + } + if (1 == leading) { + ASSIGN_DISPATCH(out, req, + (reduce_except_dim<1, Reducer>(reshape(src_, Shape2(reducing_size, trailing))))); + } else { + ASSIGN_DISPATCH(out, req, (reduce_except_dim<1, Reducer>( + reshape(swapaxis<1, 0>(reshape(src_, Shape3(leading, reducing_size, trailing))), + Shape2(reducing_size, leading * trailing))))); + } + return; + } + // 4. For non-contiguous axes, we need to push axes to the front of the shape vector then reduce. + // E.g axes = (1, 2), dim = 6 => transpose_shape = (1, 2, 0, 3, 4, 5) + Shape transpose_shape = src_shape; + index_t remaining_size = 1; + for (index_t i = 0; i < axes.ndim(); ++i) { + transpose_shape[i] = axes[i]; + if (i > 0) { + for (index_t j = axes[i - 1] + 1; j < axes[i]; ++j) { + transpose_shape[axes.ndim() - i + j] = j; + remaining_size *= src_shape[j]; + } + } + if (axes.ndim() - 1 == i) { + for (index_t j = axes[axes.ndim() - 1] + 1; j < dimsrc; ++j) { + transpose_shape[j] = j; + remaining_size *= src_shape[j]; + } + } + if (0 == i) { + for (index_t j = 0; j < axes[0]; ++j) { + transpose_shape[axes.ndim() - i + j] = j; + remaining_size *= src_shape[j]; + } + } + } + ASSIGN_DISPATCH(out, req, + (reduce_except_dim<1, Reducer>(reshape(transpose(src_, transpose_shape), + Shape2(reducing_size, remaining_size))))); +} + +/*! +* \brief a reduce to the given shape and assign to the output tensor. 
+* \param out output tensor, must have dim 1 +* \param src the source expression +* \param target_shape shape of the target tensor, must have size 1 for the reduction axes +* \tparam Reducer type of the reducing operation +* \tparam xpu +* \tparam SrcExp the src expression template +* \tparam etype type of expression +*/ +template +void ReduceToAssign(mshadow::Tensor out, const OpReqType req, + const TShape &target_shape, const SrcExp &src_) { + using namespace mshadow; + using namespace mshadow::expr; + static const int dimsrc = ExpInfo::kDim; + Shape src_shape = ShapeCheck::Check(src_); + TShape axes = GetBroadcastingAxes_(target_shape, + TShape(src_shape.shape_, src_shape.shape_ + dimsrc)); + ReduceAxesAssign(out, req, axes, src_); +} +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_BROADCAST_REDUCE_OP_COMMON_H_ diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h index 249b07b54632..9ae6a6602c2e 100644 --- a/src/operator/channel_op_common.h +++ b/src/operator/channel_op_common.h @@ -14,15 +14,15 @@ namespace mxnet { namespace op { -template -inline void concatenate_helper(const std::vector > &input, - mshadow::Tensor *output, const int dimension, +template +inline void concatenate_helper(const std::vector > &input, + mshadow::Tensor *output, const int dimension, const OpReqType req) { using mshadow::expr::concat; using mshadow::expr::slice; if (dimension == cdim) { - mshadow::Tensor out = *output; + mshadow::Tensor out = *output; size_t size = input.size(); index_t begin = 0; for (index_t i = 0; i < size; ++i) { @@ -35,9 +35,9 @@ inline void concatenate_helper(const std::vector > &in } } -template -inline void Concatenate(const std::vector > &input, - mshadow::Tensor *output, const int dimension, +template +inline void Concatenate(const std::vector > &input, + mshadow::Tensor *output, const int dimension, const OpReqType req) { if (dimension < 0) { LOG(FATAL) << "dimension (" << dimension << ") must be greater than 0"; @@ -49,15 +49,15 @@ inline void Concatenate(const std::vector > &input, } -template -void split_helper(const mshadow::Tensor &input, - std::vector > *output, +template +void split_helper(const mshadow::Tensor &input, + std::vector > *output, const int dimension, const std::vector &req) { using mshadow::expr::concat; using mshadow::expr::slice; if (dimension == cdim) { - std::vector > out = *output; + std::vector > out = *output; size_t size = out.size(); index_t begin = 0; for (index_t i = 0; i < size; ++i) { @@ -70,9 +70,9 @@ void split_helper(const mshadow::Tensor &input, } } -template -void Split(const mshadow::Tensor &input, - std::vector > *output, +template +void Split(const mshadow::Tensor &input, + std::vector > *output, const int dimension, const std::vector &req) { if (dimension < 0) { LOG(FATAL) << "dimension (" << dimension << ") must be greater than 0"; diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h index 5ece2bbfe9df..f8de862ea3cb 100644 --- a/src/operator/concat-inl.h +++ b/src/operator/concat-inl.h @@ -36,7 +36,7 @@ struct ConcatParam : public dmlc::Parameter { } }; // struct ConcatParam -template +template class ConcatOp : public Operator { public: explicit ConcatOp(ConcatParam param) @@ -53,8 +53,8 @@ class ConcatOp : public Operator { CHECK_EQ(out_data.size(), 1); CHECK_LT(dimension_, in_data[concat_enum::kData0].ndim()); Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; + std::vector > data(size_); + Tensor out; size_t leading = 1, trailing = 1; for (int i = 0; i 
< dimension_; ++i) { leading *= out_data[concat_enum::kOut].shape_[i]; @@ -64,11 +64,11 @@ class ConcatOp : public Operator { } size_t mid = out_data[concat_enum::kOut].shape_[dimension_]; Shape<3> oshape = Shape3(leading, mid, trailing); - out = out_data[concat_enum::kOut].get_with_shape(oshape, s); + out = out_data[concat_enum::kOut].get_with_shape(oshape, s); for (int i = 0; i < size_; ++i) { Shape<3> dshape = Shape3(leading, in_data[i].shape_[dimension_], trailing); - data[i] = in_data[i].get_with_shape(dshape, s); + data[i] = in_data[i].get_with_shape(dshape, s); } Concatenate(data, &out, 1, req[concat_enum::kOut]); } @@ -85,8 +85,8 @@ class ConcatOp : public Operator { CHECK_EQ(out_grad.size(), 1); CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; + std::vector > grad_in(size_); + Tensor grad; size_t leading = 1, trailing = 1; for (int i = 0; i < dimension_; ++i) { leading *= out_grad[concat_enum::kOut].shape_[i]; @@ -96,11 +96,11 @@ class ConcatOp : public Operator { } size_t mid = out_grad[concat_enum::kOut].shape_[dimension_]; Shape<3> oshape = Shape3(leading, mid, trailing); - grad = out_grad[concat_enum::kOut].get_with_shape(oshape, s); + grad = out_grad[concat_enum::kOut].get_with_shape(oshape, s); for (int i = 0; i < size_; ++i) { Shape<3> dshape = Shape3(leading, in_grad[i].shape_[dimension_], trailing); - grad_in[i] = in_grad[i].get_with_shape(dshape, s); + grad_in[i] = in_grad[i].get_with_shape(dshape, s); } Split(grad, &grad_in, 1, req); } @@ -111,7 +111,7 @@ class ConcatOp : public Operator { }; // class ConcatOp template -Operator *CreateOp(ConcatParam param); +Operator *CreateOp(ConcatParam param, int dtype); #if DMLC_USE_CXX11 class ConcatProp : public OperatorProperty { @@ -162,6 +162,41 @@ class ConcatProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + int dtype = -1; + + for (size_t i = 0; i < in_type->size(); ++i) { + if (dtype == -1) { + dtype = in_type->at(i); + } else { + CHECK(in_type->at(i) == dtype || + in_type->at(i) == -1) << + "Non-uniform data type in Concat"; + } + } + + if (dtype == -1) { + LOG(FATAL) << "Not enough information to infer type in Concat."; + return false; + } + + size_t nin = this->ListArguments().size(); + in_type->clear(); + for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); + + size_t naux = this->ListAuxiliaryStates().size(); + aux_type->clear(); + for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype); + + size_t nout = this->ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + + return true; + } + OperatorProperty* Copy() const override { auto ptr = new ConcatProp(); ptr->param_ = param_; @@ -179,7 +214,13 @@ class ConcatProp : public OperatorProperty { return out_grad; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; private: ConcatParam param_; diff --git a/src/operator/concat.cc b/src/operator/concat.cc index 6ea9bc974c5e..579443e3bd13 100644 --- a/src/operator/concat.cc +++ b/src/operator/concat.cc @@ -10,12 +10,21 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ConcatParam param) { - return new ConcatOp(param); +Operator* CreateOp(ConcatParam 
param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ConcatOp(param); + }); + return op; } -Operator* ConcatProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + CHECK(InferType(in_type, &out_type, &aux_type)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); } DMLC_REGISTER_PARAMETER(ConcatParam); diff --git a/src/operator/concat.cu b/src/operator/concat.cu index 4e24b45cc676..fb3cf3862f3e 100644 --- a/src/operator/concat.cu +++ b/src/operator/concat.cu @@ -10,8 +10,12 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ConcatParam param) { - return new ConcatOp(param); +Operator* CreateOp(ConcatParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ConcatOp(param); + }); + return op; } } // namespace op diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 74801d55a557..4a3425fdbdbd 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -25,6 +25,7 @@ namespace conv { enum ConvolutionOpInputs {kData, kWeight, kBias}; enum ConvolutionOpOutputs {kOut}; enum ConvolutionOpResource {kTempSpace}; +enum ConvolutionOpCudnnTune {kOff, kLimited, kFastest}; } struct ConvolutionParam : public dmlc::Parameter { @@ -36,26 +37,36 @@ struct ConvolutionParam : public dmlc::Parameter { uint32_t num_group; uint64_t workspace; bool no_bias; + int cudnn_tune; DMLC_DECLARE_PARAMETER(ConvolutionParam) { int shape[] = {1, 1}; - DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)"); + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x) or (d, y, x)"); DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2)) - .describe("convolution stride: (y, x)"); + .describe("convolution stride: (y, x) or (d, y, x)"); DMLC_DECLARE_FIELD(dilate).set_default(TShape(shape, shape + 2)) .describe("convolution dilate: (y, x)"); shape[0] = shape[1] = 0; DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2)) - .describe("pad for convolution: (y, x)"); + .describe("pad for convolution: (y, x) or (d, y, x)"); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) .describe("convolution filter(channel) number"); DMLC_DECLARE_FIELD(num_group).set_default(1) .describe("Number of groups partition. " "This option is not supported by CuDNN, you can use SliceChannel to num_group," "apply convolution and concat instead to achieve the same need."); - DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) + DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192) .describe("Tmp workspace for convolution (MB)."); DMLC_DECLARE_FIELD(no_bias).set_default(false) .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(cudnn_tune) + .add_enum("off", conv::kOff) + .add_enum("limited_workspace", conv::kLimited) + .add_enum("fastest", conv::kFastest) + .set_default(dmlc::GetEnv("MXNET_CUDNN_AUTOTUNE_DEFAULT", 0)) + .describe("Whether to find convolution algo by running performance test." + "Leads to higher startup time but may give better speed." + "auto tune is turned off by default." 
+ "Set environment varialbe MXNET_CUDNN_AUTOTUNE_DEFAULT=1 to turn on by default."); } }; @@ -80,6 +91,9 @@ class ConvolutionOp : public Operator { CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); + if (param_.kernel.ndim() > 2) { + LOG(FATAL) << "Volume convolution is not implmented in mshadow"; + } Tensor data = in_data[conv::kData].get(s); Shape<3> wmat_shape = Shape3(param_.num_group, @@ -154,6 +168,9 @@ class ConvolutionOp : public Operator { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful + if (param_.kernel.ndim() > 2) { + LOG(FATAL) << "Volume convolution is not implmented in mshadow"; + } CHECK_EQ(out_grad.size(), 1); size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK(in_data.size() == expected && in_grad.size() == expected); @@ -289,7 +306,10 @@ class ConvolutionOp : public Operator { }; // class ConvolutionOp template -Operator* CreateOp(ConvolutionParam param, int dtype); +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); #if DMLC_USE_CXX11 class ConvolutionProp : public OperatorProperty { @@ -321,37 +341,85 @@ class ConvolutionProp : public OperatorProperty { } const TShape &dshape = (*in_shape)[conv::kData]; if (dshape.ndim() == 0) return false; - CHECK_EQ(dshape.ndim(), 4) \ - << "Input data should be 4D in batch-num_filter-y-x"; - SHAPE_ASSIGN_CHECK(*in_shape, - conv::kWeight, - Shape4(param_.num_filter, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1])); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - out_shape->clear(); - out_shape->push_back(dshape); - const index_t ksize_y = static_cast(param_.kernel[0]); - const index_t ksize_x = static_cast(param_.kernel[1]); - CHECK_EQ(dshape[1] % param_.num_group, 0) \ + if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshape.ndim(), 4) \ + << "Input data should be 4D in batch-num_filter-y-x"; + SHAPE_ASSIGN_CHECK(*in_shape, + conv::kWeight, + Shape4(param_.num_filter, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1])); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + out_shape->clear(); + out_shape->push_back(dshape); + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + CHECK_EQ(dshape[1] % param_.num_group, 0) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0) \ + << "incorrect dilate size: " << param_.dilate; + CHECK(ksize_y <= dshape[2] + 2 * param_.pad[0] + && ksize_x <= dshape[3] + 2 * param_.pad[1]) + << "kernel size exceed input"; + (*out_shape)[conv::kOut][1] = param_.num_filter; + (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] - + (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; + (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] - + (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshape.ndim(), 5) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + SHAPE_ASSIGN_CHECK(*in_shape, + 
conv::kWeight, + Shape5(param_.num_filter, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2])); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + out_shape->clear(); + out_shape->push_back(dshape); + const index_t ksize_d = static_cast(param_.kernel[0]); + const index_t ksize_y = static_cast(param_.kernel[1]); + const index_t ksize_x = static_cast(param_.kernel[2]); + CHECK_EQ(dshape[1] % param_.num_group, 0) \ << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0) \ - << "incorrect dilate size: " << param_.dilate; - CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) - << "kernel size exceed input"; - (*out_shape)[conv::kOut][1] = param_.num_filter; - (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] - - (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1; - (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] - - (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1; - return true; + CHECK_EQ(param_.num_filter % param_.num_group, 0) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0) \ + << "incorrect dilate size: " << param_.dilate; + CHECK(ksize_d < dshape[2] + 2 * param_.pad[0] + && ksize_y <= dshape[3] + 2 * param_.pad[1] + && ksize_x <= dshape[4] + 2 * param_.pad[2]) + << "kernel size exceed input"; + if (param_.dilate.Size() != 1) { + LOG(INFO) << "Dilate is not supported in 3d convolution"; + } + (*out_shape)[conv::kOut][1] = param_.num_filter; + (*out_shape)[conv::kOut][2] = (dshape[2] + 2 * param_.pad[0] - + (1 * (ksize_d - 1) + 1)) / param_.stride[0] + 1; + (*out_shape)[conv::kOut][3] = (dshape[3] + 2 * param_.pad[1] - + (1 * (ksize_y - 1) + 1)) / param_.stride[1] + 1; + (*out_shape)[conv::kOut][4] = (dshape[4] + 2 * param_.pad[2] - + (1 * (ksize_x - 1) + 1)) / param_.stride[2] + 1; + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } } bool InferType(std::vector *in_type, diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc index f575020f9a89..28fc2e2d0257 100644 --- a/src/operator/convolution.cc +++ b/src/operator/convolution.cc @@ -10,7 +10,10 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ConvolutionParam param, int dtype) { +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { Operator *op = NULL; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { op = new ConvolutionOp(param); @@ -19,13 +22,14 @@ Operator* CreateOp(ConvolutionParam param, int dtype) { } // DO_BIND_DISPATCH comes from operator_common.h -Operator *ConvolutionProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { +Operator *ConvolutionProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { std::vector out_shape, aux_shape; std::vector out_type, aux_type; CHECK(InferType(in_type, &out_type, &aux_type)); CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - 
DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); } DMLC_REGISTER_PARAMETER(ConvolutionParam); diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu index 1a77fff616b6..6f5904658e20 100644 --- a/src/operator/convolution.cu +++ b/src/operator/convolution.cu @@ -6,6 +6,7 @@ */ #include "./convolution-inl.h" +#include #if MXNET_USE_CUDNN == 1 #include "./cudnn_convolution-inl.h" #endif // MXNET_USE_CUDNN @@ -13,12 +14,15 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ConvolutionParam param, int dtype) { +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { Operator *op = NULL; #if MXNET_USE_CUDNN == 1 if (param.dilate[0] == 1 && param.dilate[1] == 1) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNConvolutionOp(param); + op = new CuDNNConvolutionOp(param, in_shape, out_shape, ctx); }) } else { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { diff --git a/src/operator/correlation-inl.h b/src/operator/correlation-inl.h index e6c5c0b39f40..e6453fc5a3b1 100644 --- a/src/operator/correlation-inl.h +++ b/src/operator/correlation-inl.h @@ -1,13 +1,11 @@ /*! * Copyright (c) 2015 by Contributors - * \file Correlation.cu - * \brief Correlation pooling operator + * \file correlation-inl.h + * \brief correlation operator and symbol * \author Xu Dong */ - -#ifndef MXNET_OPERATOR_ROI_POOLING_INL_H_ -#define MXNET_OPERATOR_ROI_POOLING_INL_H_ - +#ifndef MXNET_OPERATOR_CORRELATION_INL_H_ +#define MXNET_OPERATOR_CORRELATION_INL_H_ #include #include #include @@ -17,17 +15,14 @@ #include #include "./mshadow_op.h" #include "./operator_common.h" - namespace mxnet { namespace op { - -// Declare enumeration of input order to make code more intuitive. -// These enums are only visible within this header +// Declare enumeration of input order to make code more intuitive. 
+// These enums are only visible within this header namespace Correlation { enum CorrelationOpInputs{kData1, kData2}; -enum CorrelationOpOutputs{kOut,kTemp1,kTemp2}; -} // namespace correlation - +enum CorrelationOpOutputs{kOut, kTemp1, kTemp2}; +} // namespace Correlation struct CorrelationParam : public dmlc::Parameter<CorrelationParam> { uint32_t max_displacement; uint32_t kernel_size; uint32_t pad_size; uint32_t stride1; uint32_t stride2; bool is_multiply; DMLC_DECLARE_PARAMETER(CorrelationParam) { - DMLC_DECLARE_FIELD(kernel_size).set_default(1).describe("kernel size for Correlation"); - DMLC_DECLARE_FIELD(max_displacement).set_default(1).describe("Max displacement of Correlation "); - DMLC_DECLARE_FIELD(stride1).set_default(1).describe("stride between Correlation"); - DMLC_DECLARE_FIELD(stride2).set_default(1).describe("stride within neighbourhood"); - DMLC_DECLARE_FIELD(pad_size).set_default(0).describe("pad for Correlation"); - DMLC_DECLARE_FIELD(is_multiply).set_default(true).describe("operation type is either multiplication or subduction"); + DMLC_DECLARE_FIELD(kernel_size).set_default(1) + .describe("kernel size for Correlation, must be an odd number"); + DMLC_DECLARE_FIELD(max_displacement).set_default(1) + .describe("Max displacement of Correlation"); + DMLC_DECLARE_FIELD(stride1).set_default(1) + .describe("stride1 quantizes data1 globally"); + DMLC_DECLARE_FIELD(stride2).set_default(1) + .describe("stride2 quantizes data2 within the neighborhood centered around data1"); + DMLC_DECLARE_FIELD(pad_size).set_default(0) + .describe("pad for Correlation"); + DMLC_DECLARE_FIELD(is_multiply).set_default(true) + .describe("operation type is either multiplication or subtraction"); } - }; - template<typename xpu> class CorrelationOp : public Operator { public: explicit CorrelationOp(CorrelationParam param) { this->param_ = param; } - - virtual void Forward( const OpContext &ctx, - const std::vector<TBlob> &in_data, - const std::vector<OpReqType> &req, - const std::vector<TBlob> &out_data, - const std::vector<TBlob> &aux_args) { + virtual void Forward(const OpContext &ctx, + const std::vector<TBlob> &in_data, + const std::vector<OpReqType> &req, + const std::vector<TBlob> &out_data, + const std::vector<TBlob> &aux_args) { using namespace mshadow; - CHECK_EQ(in_data.size(), 2); CHECK_EQ(out_data.size(), 3); Stream<xpu> *s = ctx.get_stream<xpu>(); - Tensor<xpu, 4> data1 = in_data[Correlation::kData1].get<xpu, 4>(s); Tensor<xpu, 4> data2 = in_data[Correlation::kData2].get<xpu, 4>(s); Tensor<xpu, 4> out = out_data[Correlation::kOut].get<xpu, 4>(s); - Tensor<xpu, 4> tmp1 = out_data[Correlation::kTemp1].get<xpu, 4>(s); - Tensor<xpu, 4> tmp2 = out_data[Correlation::kTemp2].get<xpu, 4>(s); + Tensor<xpu, 4> tmp1 = out_data[Correlation::kTemp1].get<xpu, 4>(s); + Tensor<xpu, 4> tmp2 = out_data[Correlation::kTemp2].get<xpu, 4>(s); tmp1 = 0.0f; tmp2 = 0.0f; - + out = 0.0f; CHECK_EQ(data1.CheckContiguous(), true); CHECK_EQ(data2.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CHECK_EQ(tmp1.CheckContiguous(), true); CHECK_EQ(tmp2.CheckContiguous(), true); - - paddedbottomheight = data1.shape_[2] + 2*param_.pad_size; - paddedbottomwidth = data1.shape_[3] + 2*param_.pad_size; - kernel_radius_ = (param_.kernel_size -1 )/2; + paddedbottomheight = data1.shape_[2] + 2 * param_.pad_size; + paddedbottomwidth = data1.shape_[3] + 2 * param_.pad_size; + kernel_radius_ = (param_.kernel_size - 1) / 2; border_size_ = param_.max_displacement + kernel_radius_; stride1 = param_.stride1; stride2 = param_.stride2; - top_width_ = ceil((float)(paddedbottomwidth - border_size_*2) / (float)stride1); - top_height_ = ceil((float)(paddedbottomheight - border_size_*2) / (float)stride1);
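+    // Editor's note (worked example, hypothetical numbers): with width = 8,
+    // pad_size = 0, kernel_size = 1 and max_displacement = 2 we get
+    // kernel_radius_ = 0 and border_size_ = 2, so for stride1 = 1 the output
+    // width is top_width_ = ceil((8 - 2 * 2) / 1) = 4.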
+    top_width_ = ceil(static_cast<float>(paddedbottomwidth - border_size_ * 2)\ + / static_cast<float>(stride1)); +    top_height_ = ceil(static_cast<float>(paddedbottomheight - border_size_ * 2)\ + / static_cast<float>(stride1)); neighborhood_grid_radius_ = param_.max_displacement / stride2; neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1; top_channels_ = neighborhood_grid_width_ * neighborhood_grid_width_; - num = data1.shape_[0]; channels = data1.shape_[1]; height = data1.shape_[2]; width = data1.shape_[3]; - - CorrelationForward(out, data1, data2, tmp1,tmp2,top_channels_,top_height_,top_width_, - param_.pad_size,param_.is_multiply,param_.max_displacement,param_.kernel_size,neighborhood_grid_radius_, - neighborhood_grid_width_,kernel_radius_,param_.stride1,param_.stride2); + CorrelationForward(out, data1, data2, tmp1, tmp2, top_channels_, top_height_, top_width_, + param_.pad_size, param_.is_multiply, + param_.max_displacement, param_.kernel_size, + neighborhood_grid_radius_, neighborhood_grid_width_, + kernel_radius_, param_.stride1, param_.stride2); } - virtual void Backward(const OpContext &ctx, const std::vector<TBlob> &out_grad, const std::vector<TBlob> &in_data, @@ -110,141 +105,130 @@ class CorrelationOp : public Operator { const std::vector<TBlob> &aux_args) { using namespace mshadow; Stream<xpu> *s = ctx.get_stream<xpu>(); - Tensor<xpu, 4> grad_data1 = in_grad[Correlation::kData1].get<xpu, 4>(s); - Tensor<xpu, 4> grad_data2 = in_grad[Correlation::kData2].get<xpu, 4>(s); + Tensor<xpu, 4> grad_data1 = in_grad[Correlation::kData1].get<xpu, 4>(s); + Tensor<xpu, 4> grad_data2 = in_grad[Correlation::kData2].get<xpu, 4>(s); Tensor<xpu, 4> out_g = out_grad[Correlation::kOut].get<xpu, 4>(s); - Tensor<xpu, 4> tmp1 = out_data[Correlation::kTemp1].get<xpu, 4>(s); - Tensor<xpu, 4> tmp2 = out_data[Correlation::kTemp2].get<xpu, 4>(s); - + Tensor<xpu, 4> tmp1 = out_data[Correlation::kTemp1].get<xpu, 4>(s); + Tensor<xpu, 4> tmp2 = out_data[Correlation::kTemp2].get<xpu, 4>(s); CHECK_EQ(grad_data1.CheckContiguous(), true); CHECK_EQ(grad_data2.CheckContiguous(), true); CHECK_EQ(out_g.CheckContiguous(), true); CHECK_EQ(tmp1.CheckContiguous(), true); CHECK_EQ(tmp2.CheckContiguous(), true); - - CorrelationBackward(out_g,grad_data1,grad_data2,tmp1,tmp2,top_channels_,top_height_,top_width_,param_.pad_size,param_.is_multiply, - param_.max_displacement,param_.kernel_size,neighborhood_grid_radius_,neighborhood_grid_width_,kernel_radius_,param_.stride1,param_.stride2,num,channels,height, width); - } - private: - CorrelationParam param_; - int paddedbottomheight; + CorrelationBackward(out_g, grad_data1, grad_data2, tmp1, tmp2, top_channels_, + top_height_, top_width_, param_.pad_size, param_.is_multiply, + param_.max_displacement, param_.kernel_size, neighborhood_grid_radius_, + neighborhood_grid_width_, kernel_radius_, param_.stride1, param_.stride2, + num, channels, height, width); + } + + private: + CorrelationParam param_; + int paddedbottomheight; int paddedbottomwidth; - uint32_t kernel_radius_; + uint32_t kernel_radius_; uint32_t border_size_; - uint32_t stride1 ; - uint32_t stride2 ; - uint32_t top_width_; - uint32_t top_height_; - uint32_t neighborhood_grid_radius_; - uint32_t neighborhood_grid_width_ ; - uint32_t top_channels_ ; + uint32_t stride1; + uint32_t stride2; + uint32_t top_width_; + uint32_t top_height_; + uint32_t neighborhood_grid_radius_; + uint32_t neighborhood_grid_width_; + uint32_t top_channels_; int num; int channels; int height; int width; -}; // class CorrelationOp - - -// Decalre Factory function +}; // class CorrelationOp +// Declare Factory function template<typename xpu> Operator* CreateOp(CorrelationParam param); - #if DMLC_USE_CXX11 class CorrelationProp : public
OperatorProperty { public: std::vector ListArguments() const override { return {"data1", "data2"}; } - std::vector ListOutputs() const override { - return {"output","tmp1","tmp2"}; + return {"output", "tmp1", "tmp2"}; } - int NumOutputs() const override { return 3; } - int NumVisibleOutputs() const override { return 1; } - - void Init(const std::vector >& kwargs) override { +void Init(const std::vector >& kwargs) override { param_.Init(kwargs); } - std::map GetParams() const override { return param_.__DICT__(); } bool InferShape(std::vector *in_shape, std::vector *out_shape, std::vector *aux_shape) const override { - using namespace mshadow; CHECK_EQ(in_shape->size(), 2) << "Input:[data1, data2]"; TShape dshape1 = in_shape->at(Correlation::kData1); TShape dshape2 = in_shape->at(Correlation::kData2); CHECK_EQ(dshape1.ndim(), 4) << "data should be a 4D tensor"; CHECK_EQ(dshape2.ndim(), 4) << "data should be a 4D tensor"; - int paddedbottomheight; int paddedbottomwidth; - uint32_t kernel_radius_; - uint32_t stride1 ; - uint32_t stride2 ; - uint32_t top_width_; - uint32_t top_height_; - uint32_t neighborhood_grid_radius_; - uint32_t neighborhood_grid_width_ ; - uint32_t top_channels_ ; + uint32_t kernel_radius_; + uint32_t stride1; + uint32_t stride2; + uint32_t top_width_; + uint32_t top_height_; + uint32_t neighborhood_grid_radius_; + uint32_t neighborhood_grid_width_; + uint32_t top_channels_; uint32_t border_size_; - paddedbottomheight = dshape1[2] + 2*param_.pad_size; paddedbottomwidth = dshape1[3] + 2*param_.pad_size; - kernel_radius_ = (param_.kernel_size -1 )/2; + kernel_radius_ = (param_.kernel_size -1)/2; border_size_ = param_.max_displacement + kernel_radius_; stride1 = param_.stride1; stride2 = param_.stride2; - top_width_ = ceil((float)(paddedbottomwidth - border_size_*2) / (float)stride1); - top_height_ = ceil((float)(paddedbottomheight - border_size_*2) / (float)stride1); + top_width_ = ceil(static_cast(paddedbottomwidth - border_size_ * 2)\ + / static_cast(stride1)); + top_height_ = ceil(static_cast(paddedbottomheight - border_size_ * 2)\ + / static_cast(stride1)); neighborhood_grid_radius_ = param_.max_displacement / stride2; neighborhood_grid_width_ = neighborhood_grid_radius_ * 2 + 1; top_channels_ = neighborhood_grid_width_ * neighborhood_grid_width_; - - CHECK_GE(top_width_, 1) << "Correlation cannot be done with current settings. Neighborhood and kernel don't fit in blob"; - CHECK_GE(top_height_, 1) << "Correlation cannot be done with current settings. 
Neighborhood and kernel don't fit in blob"; - + CHECK_GE(top_width_, 1) << + "Correlation cannot be done with current settings.Neighborhood and kernel don't fit in blob"; + CHECK_GE(top_height_, 1) << + "Correlation cannot be done with current settings.Neighborhood and kernel don't fit in blob"; out_shape->clear(); - out_shape->push_back(Shape4(dshape1[0],top_channels_,top_height_,top_width_)); - out_shape->push_back(Shape4(dshape1[0],paddedbottomheight, paddedbottomwidth,dshape1[1])); - out_shape->push_back(Shape4(dshape1[0],paddedbottomheight, paddedbottomwidth,dshape1[1])); + out_shape->push_back(Shape4(dshape1[0], top_channels_, top_height_, top_width_)); + out_shape->push_back(Shape4(dshape1[0], paddedbottomheight, paddedbottomwidth, dshape1[1])); + out_shape->push_back(Shape4(dshape1[0], paddedbottomheight, paddedbottomwidth, dshape1[1])); return true; } - OperatorProperty* Copy() const override { CorrelationProp* Correlation_sym = new CorrelationProp(); Correlation_sym->param_ = this->param_; return Correlation_sym; } - std::string TypeString() const override { return "Correlation"; } - - // decalre dependency and inplace optimization options + // decalre dependency and inplace optimization options std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override - { - return {out_grad[Correlation::kOut], out_data[Correlation::kTemp1], out_data[Correlation::kTemp2]}; - } - + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[Correlation::kOut], + out_data[Correlation::kTemp1], out_data[Correlation::kTemp2]}; +} Operator* CreateOperator(Context ctx) const override; private: - CorrelationParam param_; -}; // class CorrelationProp + CorrelationParam param_; +}; // class CorrelationProp #endif -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_Correlation_INL_H_ +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CORRELATION_INL_H_ diff --git a/src/operator/correlation.cc b/src/operator/correlation.cc index 70e2ac35a328..bcd829fff117 100644 --- a/src/operator/correlation.cc +++ b/src/operator/correlation.cc @@ -1,61 +1,144 @@ /*! 
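Editor's note: for readers following the InferShape arithmetic above, here is a minimal standalone C++ sketch of the same output-shape computation. The concrete parameter values (height, width, pad_size, and so on) are illustrative assumptions, not values taken from this patch.

#include <cmath>
#include <cstdio>

int main() {
  // Assumed example configuration (illustrative only).
  const int height = 32, width = 32;
  const int pad_size = 4, kernel_size = 3, max_displacement = 4;
  const int stride1 = 1, stride2 = 2;
  const int paddedheight = height + 2 * pad_size;            // 40
  const int paddedwidth  = width  + 2 * pad_size;            // 40
  const int kernel_radius = (kernel_size - 1) / 2;           // 1
  const int border_size = max_displacement + kernel_radius;  // 5
  // Same ceil-division as in InferShape above.
  const int top_width = static_cast<int>(
      std::ceil(static_cast<float>(paddedwidth - border_size * 2) / stride1));   // 30
  const int top_height = static_cast<int>(
      std::ceil(static_cast<float>(paddedheight - border_size * 2) / stride1));  // 30
  const int grid_radius = max_displacement / stride2;        // 2
  const int grid_width = grid_radius * 2 + 1;                // 5
  const int top_channels = grid_width * grid_width;          // 25
  std::printf("output shape: (N, %d, %d, %d)\n", top_channels, top_height, top_width);
  return 0;
}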
diff --git a/src/operator/correlation.cc b/src/operator/correlation.cc
index 70e2ac35a328..bcd829fff117 100644
--- a/src/operator/correlation.cc
+++ b/src/operator/correlation.cc
@@ -1,61 +1,144 @@
 /*!
  * Copyright (c) 2015 by Contributors
  * \file correlation.cc
- * \brief
+ * \brief correlation op
  * \author Xu Dong
 */
-
 #include "./correlation-inl.h"
+#include "./mshadow_op.h"

 namespace mshadow {
-
 template<typename Dtype>
-inline void CorrelationForward( const Tensor<cpu, 4, Dtype> &out,
-                                const Tensor<cpu, 4, Dtype> &data1,
-                                const Tensor<cpu, 4, Dtype> &data2,
-                                const Tensor<cpu, 4, Dtype> &tmp1,
-                                const Tensor<cpu, 4, Dtype> &tmp2,
-                                int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply,
-                                int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_,
-                                int kernel_radius_,int stride1_,int stride2_
-                               ) {
-  return ;
+void AddPad(const Tensor<cpu, 4, Dtype> &original,
+            const Tensor<cpu, 4, Dtype> &out,
+            int pad_size)
+{ for (index_t nbatch = 0 ; nbatch < original.size(0) ; nbatch++)
+    for (index_t channel = 0 ; channel < original.size(1) ; channel++)
+      for (index_t h = 0 ; h < original.size(2) ; h++)
+        for (index_t w = 0 ; w < original.size(3) ; w++)
+          out[nbatch][h+pad_size][w+pad_size][channel] = original[nbatch][channel][h][w];
+}
+template<typename Dtype>
+inline void CorrelationForward(const Tensor<cpu, 4, Dtype> &out,
+                               const Tensor<cpu, 4, Dtype> &data1,
+                               const Tensor<cpu, 4, Dtype> &data2,
+                               const Tensor<cpu, 4, Dtype> &tmp1,
+                               const Tensor<cpu, 4, Dtype> &tmp2,
+                               int top_channels_, int top_height_, int top_width_,
+                               int pad_size_, bool is_multiply,
+                               int max_displacement_, int kernel_size_,
+                               int neighborhood_grid_radius_, int neighborhood_grid_width_,
+                               int kernel_radius_, int stride1_, int stride2_) {
+  const int bnum = data1.size(0);
+  const int bchannels = data1.size(1);
+  const int sumelems = kernel_size_ * kernel_size_ * bchannels;
+  AddPad(data1, tmp1, pad_size_);
+  AddPad(data2, tmp2, pad_size_);
+  for (index_t i = 0 ; i < top_height_ ; i++)
+    for (index_t j = 0 ; j < top_width_; j++)
+      for (index_t nbatch = 0 ; nbatch < bnum ; nbatch++) {
+        int x1 = j*stride1_+max_displacement_;
+        int y1 = i*stride1_+max_displacement_;
+        for (index_t top_channel = 0 ; top_channel < top_channels_ ; top_channel++) {
+          int s2o = (top_channel % neighborhood_grid_width_ -\
+                     neighborhood_grid_radius_) * stride2_;
+          int s2p = (top_channel / neighborhood_grid_width_ -\
+                     neighborhood_grid_radius_) * stride2_;
+          int x2 = x1 + s2o;
+          int y2 = y1 + s2p;
+          for (index_t h = 0; h < kernel_size_; h++)
+            for (index_t w = 0; w < kernel_size_; w++)
+              for (index_t channel = 0; channel < bchannels; channel++) {
+                if (is_multiply == true)
+                  out[nbatch][top_channel][i][j] += \
+                    tmp1[nbatch][y1+h][x1+w][channel]*tmp2[nbatch][y2+h][x2+w][channel];
+                else
+                  out[nbatch][top_channel][i][j] += \
+                    fabsf(tmp1[nbatch][y1+h][x1+w][channel]-tmp2[nbatch][y2+h][x2+w][channel]);
+              }
+          out[nbatch][top_channel][i][j] /= sumelems;
+        }
+      }
 }
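+// Editor's illustration (not part of the original patch): how top_channel maps
+// to a displacement (s2o, s2p) under the formulas in CorrelationForward above.
+// With assumed settings max_displacement = 4 and stride2 = 2:
+//   neighborhood_grid_radius_ = 2, neighborhood_grid_width_ = 5, top_channels_ = 25
+//   top_channel = 0  -> s2o = -4, s2p = -4  (upper-left corner of the search grid)
+//   top_channel = 12 -> s2o =  0, s2p =  0  (center, zero displacement)
+//   top_channel = 24 -> s2o = +4, s2p = +4  (lower-right corner of the search grid)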
-
 template<typename Dtype>
 inline void CorrelationBackward(const Tensor<cpu, 4, Dtype> &out_grad,
-                                const Tensor<cpu, 4, Dtype> &in_grad1,
-                                const Tensor<cpu, 4, Dtype> &in_grad2,
-                                const Tensor<cpu, 4, Dtype> &tmp1,
-                                const Tensor<cpu, 4, Dtype> &tmp2,
-                                int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply,
-                                int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_,
-                                int kernel_radius_,int stride1_,int stride2_,int num, int channels,int height, int width
+                                const Tensor<cpu, 4, Dtype> &in_grad1,
+                                const Tensor<cpu, 4, Dtype> &in_grad2,
+                                const Tensor<cpu, 4, Dtype> &tmp1,
+                                const Tensor<cpu, 4, Dtype> &tmp2,
+                                int top_channels_, int top_height_,
+                                int top_width_, int pad_size_,
+                                bool is_multiply, int max_displacement_,
+                                int kernel_size_, int neighborhood_grid_radius_,
+                                int neighborhood_grid_width_,
+                                int kernel_radius_, int stride1_,
+                                int stride2_, int num,
+                                int channels, int height, int width
                                ) {
-
-  return ;
+  const float sumelems = kernel_size_ * kernel_size_ * channels;
+  for (int i = 0 ; i < top_height_ ; i++)
+    for (int j = 0 ; j < top_width_; j++)
+      for (int nbatch = 0 ; nbatch < num ; nbatch++) {
+        int x1 = j*stride1_+max_displacement_;
+        int y1 = i*stride1_+max_displacement_;
+        for (int top_channel = 0 ; top_channel < top_channels_ ; top_channel++) {
+          int s2o = (top_channel % neighborhood_grid_width_ - \
+                     neighborhood_grid_radius_) * stride2_;
+          int s2p = (top_channel / neighborhood_grid_width_ - \
+                     neighborhood_grid_radius_) * stride2_;
+          int x2 = x1 + s2o;
+          int y2 = y1 + s2p;
+          for (int h = 0; h < kernel_size_; h++)
+            for (int w = 0; w < kernel_size_; w++)
+              for (int channel = 0 ; channel < channels; channel++) {
+                if (is_multiply == true) {
+                  if ((y1 + h - pad_size_ >= 0) && (x1 + w - pad_size_ >= 0) && \
+                      (y1 + h < height +pad_size_) && (x1 + w < width + pad_size_)) {
+                    in_grad1[nbatch][channel][y1+h-pad_size_][x1+w-pad_size_] += \
+                      out_grad[nbatch][top_channel][i][j] * \
+                      tmp2[nbatch][y2+h][x2+w][channel]/sumelems;
+                  }
+                  if ((y2 + h - pad_size_ >= 0) && (x2 + w -pad_size_ >=0) && \
+                      (y2 + h < height +pad_size_) && (x2 + w < width + pad_size_)) {
+                    in_grad2[nbatch][channel][y2+h-pad_size_][x2+w-pad_size_] += \
+                      out_grad[nbatch][top_channel][i][j] * \
+                      tmp1[nbatch][y1+h][x1+w][channel]/sumelems;
+                  }
+                } else {
+                  if ((y1 + h - pad_size_ >= 0) && (x1 + w -pad_size_ >=0) && \
+                      (y1 + h < height + pad_size_) && (x1 + w < width + pad_size_)) {
+                    Dtype sign = (tmp1[nbatch][y1+h][x1+w][channel] >= \
+                                  tmp2[nbatch][y2+h][x2+w][channel])? Dtype(1.0) : Dtype(-1.0);
+                    in_grad1[nbatch][channel][y1+h-pad_size_][x1+w-pad_size_] +=\
+                      out_grad[nbatch][top_channel][i][j]*sign/sumelems;
+                  }
+                  if ((y2 + h - pad_size_ >= 0) && (x2 + w - pad_size_ >=0) && \
+                      (y2 + h < height + pad_size_) && (x2 + w < width + pad_size_)) {
+                    Dtype sign = (tmp1[nbatch][y1+h][x1+w][channel] >= \
+                                  tmp2[nbatch][y2+h][x2+w][channel])? Dtype(-1.0) : Dtype(1.0);
+                    in_grad2[nbatch][channel][y2+h-pad_size_][x2+w-pad_size_] +=\
+                      out_grad[nbatch][top_channel][i][j]*sign/sumelems;
+                  }
+                }
+              }
+        }
+      }
 }
 }  // namespace mshadow
-
-
 namespace mxnet {
 namespace op {
 template<>
 Operator *CreateOp<cpu>(CorrelationParam param) {
   return new CorrelationOp<cpu>(param);
 }
-
 Operator* CorrelationProp::CreateOperator(Context ctx) const {
   DO_BIND_DISPATCH(CreateOp, param_);
 }
-
 DMLC_REGISTER_PARAMETER(CorrelationParam);
-
 MXNET_REGISTER_OP_PROPERTY(Correlation, CorrelationProp)
 .describe("Apply correlation to inputs")
-.add_argument("data1", "Symbol", "Input data to the correlation.")
-.add_argument("data2", "Symbol", "Input data to the correlation.")
+.add_argument("data1", "Symbol", "Input data1 to the correlation.")
+.add_argument("data2", "Symbol", "Input data2 to the correlation.")
 .add_arguments(CorrelationParam::__FIELDS__());
-
 }  // namespace op
 }  // namespace mxnet
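Editor's note: the CPU AddPad above and the GPU blob_rearrange_kernel2 below both store the padded inputs channel-last (NHWC), which is why InferShape declares the temporaries as Shape4(n, paddedheight, paddedwidth, channels). A self-contained sketch of that rearrangement, with made-up toy sizes (the names and values are assumptions for illustration, not part of the patch):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Assumed toy configuration.
  const int N = 1, C = 2, H = 3, W = 3, pad = 1;
  const int PH = H + 2 * pad, PW = W + 2 * pad;
  std::vector<float> in(N * C * H * W);
  for (std::size_t i = 0; i < in.size(); ++i) in[i] = static_cast<float>(i);
  std::vector<float> out(N * PH * PW * C, 0.f);  // zero padding border
  // NCHW input -> padded NHWC output, same index order as AddPad.
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          out[((n * PH + h + pad) * PW + (w + pad)) * C + c] =
              in[((n * C + c) * H + h) * W + w];
  // Element (c=1, h=0, w=2) of image 0 now lives at [h=1][w=3][c=1]:
  std::printf("%f\n", out[((0 * PH + 1) * PW + 3) * C + 1]);  // prints 11
  return 0;
}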
diff --git a/src/operator/correlation.cu b/src/operator/correlation.cu
index 884afb3b0ec3..b26ae04f2d0b 100644
--- a/src/operator/correlation.cu
+++ b/src/operator/correlation.cu
@@ -1,5 +1,5 @@
 /*!
- * Copyright (c) 2015 by Contributors
+ * Copyright [2016]
  * \file Correation.cu
  * \brief Correlation operator
  * \author Xu Dong
@@ -12,212 +12,196 @@
 #define ROUND_OFF 50000
 #define WARPS_PER_BLOCK 1
-#define THREADS_PER_WARP 32
-
+#define THREADS_PER_WARP 32
 #define CORRELATION_CUDA_CHECK(condition) \
   /* Code block avoids redefinition of cudaError_t error */ \
   do { \
     cudaError_t error = condition; \
     CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
   } while (0)
-
 #define CUDA_KERNEL_LOOP(i, n) \
 for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
       i < (n); \
       i += blockDim.x * gridDim.x)
-
 namespace mshadow {
 namespace cuda {
-
 // == Correlation Kernel
-template <typename Dtype>
-__global__ void CorrelateData(const int nthreads, int num, int topwidth, int topheight, int topchannels, int topcount,
-  int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2,
+template <typename Dtype>
+__global__ void CorrelateData(const int nthreads, int num, int topwidth,
+  int topheight, int topchannels, int topcount,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int kernel_size, int stride1, int stride2,
   int bottomwidth, int bottomheight, int bottomchannels,
-  const Dtype *bottom0, const Dtype *bottom1, Dtype *top)
-{
+  const Dtype *bottom0, const Dtype *bottom1, Dtype *top) {
   extern __shared__ char patch_data_char[];
-
-  Dtype *patch_data = (Dtype *)patch_data_char;
-
-  // First (upper left) position of kernel upper-left corner in current center position of neighborhood in image 1
-  int x1 = blockIdx.x*stride1 + max_displacement;
-  int y1 = blockIdx.y*stride1 + max_displacement;
-  int item = blockIdx.z;
+  Dtype *patch_data = reinterpret_cast<Dtype *>(patch_data_char);
+  // First (upper left) position of kernel upper-left corner
+  // in current center position of neighborhood in image 1
+  int x1 = blockIdx.x * stride1 + max_displacement;
+  int y1 = blockIdx.y * stride1 + max_displacement;
+  int item = blockIdx.z;
   int ch_off = threadIdx.x;
-
-  // Load 3D patch into shared shared memory
-  for(int j = 0; j < kernel_size; j++) { // HEIGHT
-    for(int i = 0; i < kernel_size; i++) { // WIDTH
+  // Load 3D patch into shared memory
+  for (int j = 0; j < kernel_size; j++) {  // HEIGHT
+    for (int i = 0; i < kernel_size; i++) {  // WIDTH
       int ji_off = ((j * kernel_size) + i) * bottomchannels;
-      for(int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK) ) { // CHANNELS
+      for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) {
+        // CHANNELS
         int idx1 = ((item * bottomheight + y1+j) * bottomwidth + x1+i) * bottomchannels + ch;
         int idxPatchData = ji_off + ch;
         patch_data[idxPatchData] = bottom0[idx1];
       }
     }
   }
-
   __syncthreads();
-
   __shared__ Dtype sum[THREADS_PER_WARP * WARPS_PER_BLOCK];
-
-  // Compute correlation
-  for(int top_channel = 0; top_channel < topchannels; top_channel++) {
+  // Compute correlation
+  for (int top_channel = 0; top_channel < topchannels; top_channel++) {
     sum[ch_off] = 0;
-
     int s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2;
     int s2p = (top_channel / neighborhood_grid_width - neighborhood_grid_radius) * stride2;
-
-    for(int j = 0; j < kernel_size; j++) { // HEIGHT
-      for(int i = 0; i < kernel_size; i++) { // WIDTH
+    for (int j = 0; j < kernel_size; j++) {  // HEIGHT
+      for (int i = 0; i < kernel_size; i++) {  // WIDTH
         int ji_off = ((j * kernel_size) + i) * bottomchannels;
-        for(int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) { // CHANNELS
+        for (int ch = ch_off; ch < bottomchannels; ch += (THREADS_PER_WARP * WARPS_PER_BLOCK)) {
+          // CHANNELS
          int x2 = x1 + s2o;
          int y2 = y1 + s2p;
-          int idxPatchData = ji_off + ch;
-          int idx2 = ((item * bottomheight + y2+j) * bottomwidth + x2+i) * bottomchannels + ch;
-
+          int idxPatchData = ji_off + ch;
+          int idx2 = ((item * bottomheight + y2 + j) * bottomwidth + x2 + i) * bottomchannels + ch;
          sum[ch_off] += patch_data[idxPatchData] * bottom1[idx2];
        }
      }
    }
-
    __syncthreads();
-
-    if(ch_off == 0) {
+    if (ch_off == 0) {
      Dtype total_sum = 0;
-      for(int idx = 0; idx < THREADS_PER_WARP * WARPS_PER_BLOCK; idx++) {
+      for (int idx = 0; idx < THREADS_PER_WARP * WARPS_PER_BLOCK; idx++) {
        total_sum += sum[idx];
      }
-      const int sumelems = kernel_size*kernel_size*bottomchannels;
-      const int index = ((top_channel*topheight + blockIdx.y)*topwidth)+blockIdx.x;
-      top[index + item*topcount] = total_sum / (float)sumelems;
-    } // Aggregate result of different threads
+      const int sumelems = kernel_size * kernel_size * bottomchannels;
+      const int index = ((top_channel * topheight + blockIdx.y) * topwidth) + blockIdx.x;
+      top[index + item*topcount] = total_sum / static_cast<float>(sumelems);
+    }  // Aggregate result of different threads
  }
 }
-
-// == Correlation Backward Pass Kernel (For data1)
-template <typename Dtype>
-__global__ void CorrelateDataBackward0(const int nthreads, int num, int item, int topwidth, int topheight, int topchannels,
-  int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int stride1, int stride2,
-  int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, int bottomchannels, int bottomcount, int pad_size,
-  Dtype *bottom0diff, const Dtype *bottom1, const Dtype *topdiff)
-{
+// == Correlation Backward Pass Kernel (For data1)
+template <typename Dtype>
+__global__ void CorrelateDataBackward0(const int nthreads, int num, int item,
+  int topwidth, int topheight, int topchannels,
+  int max_displacement, int neighborhood_grid_radius,
+  int neighborhood_grid_width, int kernel_radius, int stride1, int stride2,
+  int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight,
+  int bottomchannels, int bottomcount, int pad_size,
+  Dtype *bottom0diff, const Dtype *bottom1, const Dtype *topdiff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
-    int n = index % bottomchannels; //channels
-    int l = (index / bottomchannels) % bottomwidth + pad_size; //w-pos
-    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; //h-pos
-
-    //Get X,Y ranges and clamp
-    // round_off is a trick to enable integer division with ceil, even for negative numbers
-    // We use a large offset, for the inner part not to become negative.
+    int n = index % bottomchannels;  // channels
+    int l = (index / bottomchannels) % bottomwidth + pad_size;  // w-pos
+    int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size;  // h-pos
+    // Get X,Y ranges and clamp
+    // round_off is a trick to enable integer division with ceil, even for negative numbers
+    // We use a large offset, for the inner part not to become negative.
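+    // Editor's illustration (not part of the original code): with stride1 = 2,
+    // round_off = 50000 and round_off_s1 = 100000, evaluating the xmin formula
+    // for (l - 2*kernel_radius - max_displacement) = -3 gives
+    //   (-3 + 100000 - 1) / 2 + 1 - 50000 = 49998 + 1 - 50000 = -1,
+    // which matches ceil(-3 / 2.0) = -1. Plain integer division truncates
+    // toward zero, which breaks the positive case: 3 / 2 = 1, while the
+    // formula yields (3 + 100000 - 1) / 2 + 1 - 50000 = 2 = ceil(1.5).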
const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - - // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: - int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 - int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 - - // Same here: - int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; // floor (l - max_displacement) / stride1 - int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; // floor (m - max_displacement) / stride1 - - + // We add round_off before_s1 the int division and subtract round_off after it, + // to ensure the formula matches ceil behavior: + int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ + / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 + int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ + / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 + // Same here: + int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; + // floor (l - max_displacement) / stride1 + int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; + // floor (m - max_displacement) / stride1 Dtype sum = 0; - if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1)) - { - xmin = max(0,xmin); - xmax = min(topwidth-1,xmax); - - ymin = max(0,ymin); - ymax = min(topheight-1,ymax); - - for(int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { - for(int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { - - // Get bottom1 data: + if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) { + xmin = max(0, xmin); + xmax = min(topwidth-1, xmax); + ymin = max(0, ymin); + ymax = min(topheight-1, ymax); + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get bottom1 data: int s2o = stride2 * o; int s2p = stride2 * p; - int idxbot1 = ((item * pbottomheight + (m+s2p)) * pbottomwidth + (l+s2o)) * bottomchannels + n; - Dtype bot1tmp = bottom1[idxbot1]; // bottom1[l+s2o,m+s2p,n] - - // Index offset for topdiff in following loops: - int op = (p+neighborhood_grid_radius) * neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] + int idxbot1 = ((item * pbottomheight + (m + s2p)) * pbottomwidth + (l + s2o))\ + * bottomchannels + n; + Dtype bot1tmp = bottom1[idxbot1]; // bottom1[l+s2o,m+s2p,n] + // Index offset for topdiff in following loops: + int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\ + + (o + neighborhood_grid_radius); // index [o,p] int idxopoffset = (item * topchannels + op); - - for(int y = ymin; y <= ymax; y++) { - for(int x = xmin; x <= xmax; x++) { - int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] sum += topdiff[idxtopdiff] * bot1tmp; } } } } } - const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels; - const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); - 
bottom0diff[bot0index + item*bottomcount] = sum / (float)sumelems; + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2+1) * bottomchannels; + const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); + bottom0diff[bot0index + item * bottomcount] = sum / static_cast(sumelems); } - } - // == Correlation Backward Pass Kernel (For Blob 1) -template -__global__ void CorrelateDataBackward1(const int nthreads, int num, int item, int topwidth, int topheight, int topchannels, - int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, - int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, int bottomchannels, int bottomcount, int pad_size, - const Dtype *bottom0, Dtype *bottom1diff, const Dtype *topdiff) -{ +template +__global__ void CorrelateDataBackward1(const int nthreads, + int num, int item, int topwidth, int topheight, int topchannels, + int max_displacement, int neighborhood_grid_radius, + int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, + int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, + int bottomchannels, int bottomcount, int pad_size, + const Dtype *bottom0, Dtype *bottom1diff, const Dtype *topdiff) { CUDA_KERNEL_LOOP(index, nthreads) { - //int l = index % bottomwidth + pad_size; //w-pos - //int m = (index / bottomwidth) % bottomheight + pad_size; //h-pos - //int n = (index / bottomwidth / bottomheight) % bottomchannels; //channels - int n = index % bottomchannels; //channels - int l = (index / bottomchannels) % bottomwidth + pad_size; //w-pos - int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; //h-pos - - // round_off is a trick to enable integer division with ceil, even for negative numbers - // We use a large offset, for the inner part not to become negative. + // int l = index % bottomwidth + pad_size; //w-pos + // int m = (index / bottomwidth) % bottomheight + pad_size; // h-pos + // int n = (index / bottomwidth / bottomheight) % bottomchannels; // channels + int n = index % bottomchannels; // channels + int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos + int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos + // round_off is a trick to enable integer division with ceil, even for negative numbers + // We use a large offset, for the inner part not to become negative. 
const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - Dtype sum = 0; - for(int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { - for(int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { - + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { int s2o = stride2 * o; int s2p = stride2 * p; - - //Get X,Y ranges and clamp - // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: - int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 - int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 - - // Same here: - int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; // floor (l - max_displacement - s2o) / stride1 - int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; // floor (m - max_displacement - s2p) / stride1 - - if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1)) - { - xmin = max(0,xmin); - xmax = min(topwidth-1,xmax); - - ymin = max(0,ymin); - ymax = min(topheight-1,ymax); - - // Get bottom0 data: - int idxbot0 = ((item * pbottomheight + (m-s2p)) * pbottomwidth + (l-s2o)) * bottomchannels + n; - Dtype bot0tmp = bottom0[idxbot0]; // bottom1[l+s2o,m+s2p,n] - - // Index offset for topdiff in following loops: - int op = (p+neighborhood_grid_radius) * neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off after it, + // to ensure the formula matches ceil behavior: + int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\ + / stride1 + 1 - round_off; + // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 + int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\ + / stride1 + 1 - round_off; + // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 + // Same here: + int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; + // floor (l - max_displacement - s2o) / stride1 + int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; + // floor (m - max_displacement - s2p) / stride1 + if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) { + xmin = max(0, xmin); + xmax = min(topwidth-1, xmax); + ymin = max(0, ymin); + ymax = min(topheight-1, ymax); + // Get bottom0 data: + int idxbot0 = ((item * pbottomheight + (m - s2p)) \ + * pbottomwidth + (l - s2o)) * bottomchannels + n; + Dtype bot0tmp = bottom0[idxbot0]; // bottom1[l+s2o,m+s2p,n] + // Index offset for topdiff in following loops: + int op = (p+neighborhood_grid_radius) * \ + neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] int idxOpOffset = (item * topchannels + op); - - for(int y = ymin; y <= ymax; y++) { - for(int x = xmin; x <= xmax; x++) { - int idxtopdiff = (idxOpOffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxtopdiff = (idxOpOffset * topheight + y)\ + * topwidth + x; // topdiff[x,y,o,p] sum += topdiff[idxtopdiff] * bot0tmp; } } @@ -225,175 +209,178 @@ 
__global__ void CorrelateDataBackward1(const int nthreads, int num, int item, in } } const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels; - const int bot1index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); - bottom1diff[bot1index + item*bottomcount] = sum / (float)sumelems; + const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size); + bottom1diff[bot1index + item * bottomcount] = sum / static_cast(sumelems); } - } - // == Correlation Kernel Subtraction -template -__global__ void CorrelateDataSubtract(const int nthreads, int num, int item, int topwidth, int topheight, int topchannels, int topcount, - int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, +template +__global__ void CorrelateDataSubtract(const int nthreads, int num, int item, + int topwidth, int topheight, int topchannels, int topcount, + int max_displacement, int neighborhood_grid_radius, + int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, int bottomwidth, int bottomheight, int bottomchannels, - const Dtype *bottom0, const Dtype *bottom1, Dtype *top) -{ + const Dtype *bottom0, const Dtype *bottom1, Dtype *top) { CUDA_KERNEL_LOOP(index, nthreads) { - int x = index % topwidth; //w-pos - int y = (index / topwidth) % topheight; //h-pos - int c = (index / topwidth / topheight) % topchannels; //channels - - // Offset of patch in image 2 + int x = index % topwidth; // w-pos + int y = (index / topwidth) % topheight; // h-pos + int c = (index / topwidth / topheight) % topchannels; // channels + // Offset of patch in image 2 int s2o = (c % neighborhood_grid_width - neighborhood_grid_radius) * stride2; int s2p = (c / neighborhood_grid_width - neighborhood_grid_radius) * stride2; - - // First (upper left) position of kernel center in current neighborhood in image 1 + // First (upper left) position of kernel center in current neighborhood in image 1 int x1 = x*stride1 + kernel_radius + max_displacement; int y1 = y*stride1 + kernel_radius + max_displacement; - - // Iterate through 3D patch + // Iterate through 3D patch Dtype sum = 0; - for(int j = -kernel_radius; j <= kernel_radius; j++) { // HEIGHT - for(int i = -kernel_radius; i <= kernel_radius; i++) { // WIDTH - for(int l = 0; l < bottomchannels; l++) { // CHANNELS - // Calculate position in image 2 + for (int j = -kernel_radius; j <= kernel_radius; j++) { // HEIGHT + for (int i = -kernel_radius; i <= kernel_radius; i++) { // WIDTH + for (int l = 0; l < bottomchannels; l++) { // CHANNELS + // Calculate position in image 2 int x2 = x1 + s2o; int y2 = y1 + s2p; - - // Indices in bottom data: (CH=l,W=x2,H=y2,N) - int idx1 = ((item * bottomheight + y1+j) * bottomwidth + x1+i) * bottomchannels + l; - int idx2 = ((item * bottomheight + y2+j) * bottomwidth + x2+i) * bottomchannels + l; - - // Do the correlation: + // Indices in bottom data: (CH=l,W=x2,H=y2,N) + int idx1 = ((item * bottomheight + y1 + j) * bottomwidth + x1 + i) \ + * bottomchannels + l; + int idx2 = ((item * bottomheight + y2 + j) * bottomwidth + x2 + i) \ + * bottomchannels + l; + // Do the correlation: sum += fabsf(bottom0[idx1] - bottom1[idx2]); } } } - const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels; - top[index + item*topcount] = sum / (float)sumelems; + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2 + 1) * bottomchannels; + top[index + item * topcount] = sum / static_cast(sumelems); } - } - - -// == 
Correlation Backward Pass Kernel (For Blob 0) -template -__global__ void CorrelateDataBackward0Subtract(const int nthreads, int num, int item, int topwidth, int topheight, int topchannels, - int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, - int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, int bottomchannels, int bottomcount, int pad_size, - Dtype *bottom0diff, const Dtype *bottom0, const Dtype *bottom1, const Dtype *topdiff) -{ +// == Correlation Backward Pass Kernel (For Blob 0) +template +__global__ void CorrelateDataBackward0Subtract(const int nthreads, int num, + int item, int topwidth, int topheight, int topchannels, + int max_displacement, int neighborhood_grid_radius, + int neighborhood_grid_width, int kernel_radius, + int stride1, int stride2, int bottomwidth, int bottomheight, + int pbottomwidth, int pbottomheight, + int bottomchannels, int bottomcount, int pad_size, + Dtype *bottom0diff, const Dtype *bottom0, const Dtype *bottom1, const Dtype *topdiff) { CUDA_KERNEL_LOOP(index, nthreads) { - int l = index % bottomwidth + pad_size; //w-pos - int m = (index / bottomwidth) % bottomheight + pad_size; //h-pos - int n = (index / bottomwidth / bottomheight) % bottomchannels; //channels - - //Get X,Y ranges and clamp - // round_off is a trick to enable integer division with ceil, even for negative numbers - // We use a large offset, for the inner part not to become negative. + int n = index % bottomchannels; // channels + int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos + int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos + // Get X,Y ranges and clamp + // round_off is a trick to enable integer division with ceil, even for negative numbers + // We use a large offset, for the inner part not to become negative. 
const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - - // We add round_off before_s1 the int division and subtract round_off after it, to ensure the formula matches ceil behavior: - int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 - int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1) / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 - - // Same here: - int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; // floor (l - max_displacement) / stride1 - int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; // floor (m - max_displacement) / stride1 - - + int idxbot0 = ((item * pbottomheight + m) * pbottomwidth + l)\ + * bottomchannels + n; + // We add round_off before_s1 the int division and subtract round_off after it, + // to ensure the formula matches ceil behavior: + int xmin = (l - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ + / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 + int ymin = (m - 2*kernel_radius - max_displacement + round_off_s1 - 1)\ + / stride1 + 1 - round_off; // ceil (l - 2*kernel_radius - max_displacement) / stride1 + // Same here: + int xmax = (l - max_displacement + round_off_s1) / stride1 - round_off; + // floor (l - max_displacement) / stride1 + int ymax = (m - max_displacement + round_off_s1) / stride1 - round_off; + // floor (m - max_displacement) / stride1 Dtype sum = 0; - if(xmax>=0 && ymax>=0 && (xmin<=topwidth-1) && (ymin<=topheight-1)) - { - xmin = max(0,xmin); - xmax = min(topwidth-1,xmax); - - ymin = max(0,ymin); - ymax = min(topheight-1,ymax); - - for(int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { - for(int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { - - // Get bottom1 data: + if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth-1) && (ymin <= topheight-1)) { + xmin = max(0, xmin); + xmax = min(topwidth-1, xmax); + ymin = max(0, ymin); + ymax = min(topheight-1, ymax); + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { + // Get bottom1 data: int s2o = stride2 * o; int s2p = stride2 * p; - int idxbot = ((item * pbottomheight + (m+s2p)) * pbottomwidth + (l+s2o)) * bottomchannels + n; - Dtype bot0tmp = bottom0[idxbot]; // bottom0[l+s2o,m+s2p,n] - Dtype bot1tmp = bottom1[idxbot]; // bottom1[l+s2o,m+s2p,n] + int idxbot1 = ((item * pbottomheight + (m+s2p)) * pbottomwidth\ + + (l+s2o)) * bottomchannels + n; + Dtype bot0tmp = bottom0[idxbot0]; + Dtype bot1tmp = bottom1[idxbot1]; Dtype sign = (bot0tmp >= bot1tmp) ? 
Dtype(1.0) : Dtype(-1.0); - - // Index offset for topdiff in following loops: - int op = (p+neighborhood_grid_radius) * neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] + // Index offset for topdiff in following loops: + int op = (p+neighborhood_grid_radius) * neighborhood_grid_width\ + + (o + neighborhood_grid_radius); // index [o,p] int idxopoffset = (item * topchannels + op); - - for(int y = ymin; y <= ymax; y++) { - for(int x = xmin; x <= xmax; x++) { - int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxtopdiff = (idxopoffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] sum += topdiff[idxtopdiff] * sign; } } } } } - const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels; - bottom0diff[index + item*bottomcount] = sum / (float)sumelems; + const int sumelems = (kernel_radius * 2 + 1) * (kernel_radius * 2+1) * bottomchannels; + const int bot0index = ((n * bottomheight) + (m-pad_size)) * bottomwidth + (l-pad_size); + bottom0diff[bot0index + item * bottomcount] = sum / static_cast(sumelems); } - } - - -// == Correlation Backward Pass Kernel (For Blob 1) -template -__global__ void CorrelateDataBackward1Subtract(const int nthreads, int num, int item, int topwidth, int topheight, int topchannels, - int max_displacement, int neighborhood_grid_radius, int neighborhood_grid_width, int kernel_radius, int stride1, int stride2, - int bottomwidth, int bottomheight, int pbottomwidth, int pbottomheight, int bottomchannels, int bottomcount, int pad_size, - const Dtype *bottom0, const Dtype *bottom1, Dtype *bottom1diff, const Dtype *topdiff) -{ - CUDA_KERNEL_LOOP(index, nthreads) { - int l = index % bottomwidth + pad_size; //w-pos - int m = (index / bottomwidth) % bottomheight + pad_size; //h-pos - int n = (index / bottomwidth / bottomheight) % bottomchannels; //channels - - // round_off is a trick to enable integer division with ceil, even for negative numbers - // We use a large offset, for the inner part not to become negative. +// == Correlation Backward Pass Kernel (For Blob 1) +template +__global__ void CorrelateDataBackward1Subtract(const int nthreads, int num, + int item, int topwidth, int topheight, int topchannels, + int max_displacement, int neighborhood_grid_radius, + int neighborhood_grid_width, int kernel_radius, + int stride1, int stride2, int bottomwidth, int bottomheight, + int pbottomwidth, int pbottomheight, int bottomchannels, + int bottomcount, int pad_size, const Dtype *bottom0, + const Dtype *bottom1, Dtype *bottom1diff, const Dtype *topdiff) { + CUDA_KERNEL_LOOP(index, nthreads) { + // int l = index % bottomwidth + pad_size; //w-pos + // int m = (index / bottomwidth) % bottomheight + pad_size; // h-pos + // int n = (index / bottomwidth / bottomheight) % bottomchannels; // channels + int n = index % bottomchannels; // channels + int l = (index / bottomchannels) % bottomwidth + pad_size; // w-pos + int m = (index / bottomchannels / bottomwidth) % bottomheight + pad_size; // h-pos + // round_off is a trick to enable integer division with ceil, even for negative numbers + // We use a large offset, for the inner part not to become negative. 
const int round_off = ROUND_OFF; const int round_off_s1 = stride1 * round_off; - Dtype sum = 0; - for(int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { - for(int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { - + int idxbot1 = ((item * pbottomheight + m) * pbottomwidth + l)\ + * bottomchannels + n; + for (int p = -neighborhood_grid_radius; p <= neighborhood_grid_radius; p++) { + for (int o = -neighborhood_grid_radius; o <= neighborhood_grid_radius; o++) { int s2o = stride2 * o; int s2p = stride2 * p; - - int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1) / stride1 + 1 - round_off; - int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1) / stride1 + 1 - round_off; - - int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; - int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; - if((xmax>=0) && (ymax>=0) && (xmin<=topwidth-1) && (ymin<=topheight-1)) - { - xmin = max(0,xmin); - xmax = min(topwidth-1,xmax); - - ymin = max(0,ymin); - ymax = min(topheight-1,ymax); - - // Get bottom0 data: - int idxbot = ((item * pbottomheight + (m-s2p)) * pbottomwidth + (l-s2o)) * bottomchannels + n; - // bottom0[l+s2o,m+s2p,n] - Dtype bot0tmp = bottom0[idxbot]; - Dtype bot1tmp = bottom1[idxbot]; + // Get X,Y ranges and clamp + // We add round_off before_s1 the int division and subtract round_off after it, + // to ensure the formula matches ceil behavior: + int xmin = (l - 2*kernel_radius - max_displacement - s2o + round_off_s1 - 1)\ + / stride1 + 1 - round_off; + // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 + int ymin = (m - 2*kernel_radius - max_displacement - s2p + round_off_s1 - 1)\ + / stride1 + 1 - round_off; + // ceil (l - 2*kernel_radius - max_displacement - s2o) / stride1 + // Same here: + int xmax = (l - max_displacement - s2o + round_off_s1) / stride1 - round_off; + // floor (l - max_displacement - s2o) / stride1 + int ymax = (m - max_displacement - s2p + round_off_s1) / stride1 - round_off; + // floor (m - max_displacement - s2p) / stride1 + if (xmax >= 0 && ymax >= 0 && (xmin <= topwidth - 1) && (ymin <= topheight - 1)) { + xmin = max(0, xmin); + xmax = min(topwidth-1, xmax); + ymin = max(0, ymin); + ymax = min(topheight-1, ymax); + // Get bottom0 data: + int idxbot0 = ((item * pbottomheight + (m - s2p)) * pbottomwidth + (l - s2o))\ + * bottomchannels + n; + // bottom0[l+s2o,m+s2p,n] + Dtype bot0tmp = bottom0[idxbot0]; + Dtype bot1tmp = bottom1[idxbot1]; Dtype sign = (bot0tmp >= bot1tmp) ? 
Dtype(-1.0) : Dtype(1.0); - - // Index offset for topdiff in following loops: - int op = (p+neighborhood_grid_radius) * neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] + // Index offset for topdiff in following loops: + int op = (p+neighborhood_grid_radius) * \ + neighborhood_grid_width + (o+neighborhood_grid_radius); // index [o,p] int idxOpOffset = (item * topchannels + op); - - for(int y = ymin; y <= ymax; y++) { - for(int x = xmin; x <= xmax; x++) { - int idxtopdiff = (idxOpOffset * topheight + y) * topwidth + x; // topdiff[x,y,o,p] + for (int y = ymin; y <= ymax; y++) { + for (int x = xmin; x <= xmax; x++) { + int idxtopdiff = (idxOpOffset * topheight + y)\ + * topwidth + x; // topdiff[x,y,o,p] sum += topdiff[idxtopdiff] * sign; } } @@ -401,45 +388,40 @@ __global__ void CorrelateDataBackward1Subtract(const int nthreads, int num, int } } const int sumelems = (kernel_radius*2+1)*(kernel_radius*2+1)*bottomchannels; - bottom1diff[index + item*bottomcount] = sum / (float)sumelems; + const int bot1index = ((n * bottomheight) + (m - pad_size)) * bottomwidth + (l - pad_size); + bottom1diff[bot1index + item * bottomcount] = sum / static_cast(sumelems); } - } -// == Forward -// == Dimension rearrangement Kernel - +// == Forward +// == Dimension rearrangement Kernel template -__global__ void blob_rearrange_kernel2(const Dtype* in, Dtype* out, int num, int channels, int width, int height, int widthheight, int padding, int pwidthheight) -{ - // change shape from [batchsize,channel,y,x] to [batchsize,y,x,channel] - int xy = blockIdx.x*blockDim.x + threadIdx.x; - if(xy>=widthheight) +__global__ void blob_rearrange_kernel2(const Dtype* in, Dtype* out, int num, +int channels, int width, int height, int widthheight, int padding, int pwidthheight) { + // change shape from [batchsize,channel,y,x] to [batchsize,y,x,channel] + int xy = blockIdx.x * blockDim.x + threadIdx.x; + if (xy >= widthheight ) return; - int ch = blockIdx.y; int n = blockIdx.z; - - Dtype value=in[(n*channels+ch)*widthheight+xy]; - + Dtype value = in[(n * channels + ch) * widthheight + xy]; __syncthreads(); - int xpad = (xy % width + padding); int ypad = (xy / width + padding); - int xypad = ypad * (width+2*padding) + xpad; - - out[(n*pwidthheight+xypad)*channels + ch] = value; -} -template + int xypad = ypad * (width + 2 * padding) + xpad; + out[(n * pwidthheight + xypad) * channels + ch] = value; +} +template void Forward_gpu( const Tensor &out, const Tensor &data1, const Tensor &data2, const Tensor &tmp1, const Tensor &tmp2, - int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply, - int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_, - int kernel_radius_,int stride1_,int stride2_,cudaStream_t stream,cudaStream_t stream_tmp1,cudaStream_t stream_tmp2) -{ + int top_channels_, int top_height_, int top_width_, int pad_size_, + bool is_multiply, int max_displacement_, int kernel_size_, + int neighborhood_grid_radius_, int neighborhood_grid_width_, + int kernel_radius_, int stride1_, int stride2_, cudaStream_t stream, + cudaStream_t stream_tmp1, cudaStream_t stream_tmp2) { const Dtype *bottom_data1 = data1.dptr_; const Dtype *bottom_data2 = data2.dptr_; Dtype *rbot1 = tmp1.dptr_; @@ -451,55 +433,49 @@ void Forward_gpu( const int bwidth = data1.size(3); const int bwidthheight = bwidth * bheight; const int topcount = top_width_ * top_height_ * top_channels_; - - dim3 threadsPerBlock(THREADS_PER_WARP * WARPS_PER_BLOCK); - int threads_per_block=16; 
- dim3 totalBlocksRearr((bwidthheight-1)/threads_per_block+1, bchannels, bnum); + dim3 threadsPerBlock(THREADS_PER_WARP * WARPS_PER_BLOCK); + int threads_per_block = 16; + dim3 totalBlocksRearr((bwidthheight - 1) / threads_per_block + 1, bchannels, bnum); const int pwidthheight = (bwidth + 2 * pad_size_) * (bheight + 2 * pad_size_); - - blob_rearrange_kernel2<<>>(bottom_data1,rbot1,bnum,bchannels,bwidth,bheight,bwidthheight,pad_size_,pwidthheight); - blob_rearrange_kernel2<<>>(bottom_data2,rbot2,bnum,bchannels,bwidth,bheight,bwidthheight,pad_size_,pwidthheight); - + blob_rearrange_kernel2<<>> + (bottom_data1, rbot1, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight); + blob_rearrange_kernel2<<>> + (bottom_data2, rbot2, bnum, bchannels, bwidth, bheight, bwidthheight, pad_size_, pwidthheight); const int num = bnum; const int channels = bchannels; - const int height = bheight + 2*pad_size_; - const int width = bwidth + 2*pad_size_; - - const int shared_memory_per_block = (kernel_size_*kernel_size_)*bchannels; - - if(is_multiply == true) { - // CorrelationLayer + const int height = bheight + 2 * pad_size_; + const int width = bwidth + 2 * pad_size_; + const int shared_memory_per_block = (kernel_size_ * kernel_size_) * bchannels; + if (is_multiply == true) { + // CorrelationLayer int topThreadCount = topcount; - dim3 totalBlocksCorr(top_width_, top_height_, num); - - CorrelateData<<>>( + CorrelateData<<>>( topThreadCount, num, top_width_, top_height_, top_channels_, topcount, - max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, kernel_size_, + max_displacement_, neighborhood_grid_radius_, + neighborhood_grid_width_, kernel_radius_, kernel_size_, stride1_, stride2_, width, height, channels, - rbot1, rbot2, top - ); + rbot1, rbot2, top); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); - } else { - // CorrelationLayer - for(int n = 0; n < num; n++) { - + // CorrelationLayer + for (int n = 0; n < num; n++) { int topThreadCount = topcount; - const int gridSize = (topThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; - CorrelateDataSubtract<<>>( + const int gridSize = (topThreadCount + kMaxThreadsPerBlock - 1)\ + / kMaxThreadsPerBlock; + CorrelateDataSubtract<<>>( topThreadCount, num, n, top_width_, top_height_, top_channels_, topcount, - max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, - stride1_, stride2_,width, height, channels,rbot1, rbot2, top ); - + max_displacement_, neighborhood_grid_radius_, + neighborhood_grid_width_, kernel_radius_, + stride1_, stride2_, width, height, channels, rbot1, rbot2, top); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); } } } - template void Backward_gpu( const Tensor &out_grad, @@ -507,134 +483,124 @@ void Backward_gpu( const Tensor &in_grad2, const Tensor &tmp1, const Tensor &tmp2, - int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply, - int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_, - int kernel_radius_,int stride1_,int stride2_,cudaStream_t stream0,cudaStream_t stream1,int num,int channels,int height,int width) -{ - - // Get top diff, compute bottom diff + int top_channels_, int top_height_, + int top_width_, int pad_size_, bool is_multiply, + int max_displacement_, int kernel_size_, + int neighborhood_grid_radius_, int neighborhood_grid_width_, + int kernel_radius_, int stride1_, int stride2_, + cudaStream_t stream0, cudaStream_t stream1, + int num, int channels, int height, 
int width) { + // Get top diff, compute bottom diff const Dtype* top_diff = out_grad.dptr_; - Dtype* bottom0_diff = in_grad1.dptr_; Dtype* bottom1_diff = in_grad2.dptr_; - const Dtype* rbot1 = tmp1.dptr_; const Dtype* rbot2 = tmp2.dptr_; - - const int paddedheight = height + 2*pad_size_; - const int paddedwidth = width + 2*pad_size_; - + const int paddedheight = height + 2 * pad_size_; + const int paddedwidth = width + 2 * pad_size_; const int bottomcount = channels * height * width; int botThreadCount = bottomcount; const int gridSize = (botThreadCount + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; - - // CorrelationLayerBackward - - if(is_multiply == true) { - - // == Run kernel Backward 0 - dim3 totalBlocksBackward0(width, height, channels * num); //First dim is fastest - const int buffer_size_backw0 = ((int)ceil((float)(2 * kernel_radius_) / (float)stride1_) + 1) * top_channels_; - - // == Run kernel Backward 0 - for(int n = 0; n < num; n++) { - - CorrelateDataBackward0<<>>( + // CorrelationLayerBackward + if (is_multiply == true) { + // == Run kernel Backward 0 + dim3 totalBlocksBackward0(width, height, channels * num); // First dim is fastest + const int buffer_size_backw0 = \ + (static_cast(ceil(static_cast(2 * kernel_radius_)\ + / static_cast(stride1_))) + 1) * top_channels_; + // == Run kernel Backward 0 + for (int n = 0; n < num; n++) { + CorrelateDataBackward0<<>>( botThreadCount, num, n, top_width_, top_height_, top_channels_, max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, - bottom0_diff, rbot2, top_diff - ); - + bottom0_diff, rbot2, top_diff); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); } - - // == Run kernel Backward 1 - for(int n = 0; n < num; n++) { - CorrelateDataBackward1<<>>( + // == Run kernel Backward 1 + for (int n = 0; n < num; n++) { + CorrelateDataBackward1<<>>( botThreadCount, num, n, top_width_, top_height_, top_channels_, max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, - rbot1, bottom1_diff, top_diff - ); - + rbot1, bottom1_diff, top_diff); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); } - } else { - for(int n = 0; n < num; n++) { - //Bottom0: - CorrelateDataBackward0Subtract<<>>( + for (int n = 0; n < num; n++) { + // Bottom0: + CorrelateDataBackward0Subtract<<>>( botThreadCount, num, n, top_width_, top_height_, top_channels_, max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, - bottom0_diff, rbot1, rbot2, top_diff - ); - + bottom0_diff, rbot1, rbot2, top_diff); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); } - - for(int n = 0; n < num; n++) { - //Bottom1: - CorrelateDataBackward1Subtract<<>>( + for (int n = 0; n < num; n++) { + // Bottom1: + CorrelateDataBackward1Subtract<<>>( botThreadCount, num, n, top_width_, top_height_, top_channels_, max_displacement_, neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, width, height, paddedwidth, paddedheight, channels, bottomcount, pad_size_, - rbot1, rbot2, bottom1_diff, top_diff - ); + rbot1, rbot2, bottom1_diff, top_diff); CORRELATION_CUDA_CHECK(cudaPeekAtLastError()); } } } - } // namespace cuda - template -inline void CorrelationForward( const Tensor &out, - const Tensor 
&data1, - const Tensor &data2, - const Tensor &tmp1, - const Tensor &tmp2, - int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply, - int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_, - int kernel_radius_,int stride1_,int stride2_ +inline void CorrelationForward(const Tensor &out, + const Tensor &data1, + const Tensor &data2, + const Tensor &tmp1, + const Tensor &tmp2, + int top_channels_, int top_height_, + int top_width_, int pad_size_, bool is_multiply, + int max_displacement_, int kernel_size_, + int neighborhood_grid_radius_, int neighborhood_grid_width_, + int kernel_radius_, int stride1_, int stride2_ ) { - cudaStream_t stream = Stream::GetStream(out.stream_); + cudaStream_t stream = Stream::GetStream(out.stream_); cudaStream_t stream_tmp1 = Stream::GetStream(tmp1.stream_); cudaStream_t stream_tmp2 = Stream::GetStream(tmp2.stream_); - cuda::Forward_gpu(out, data1, data2, tmp1,tmp2,top_channels_,top_height_,top_width_,pad_size_,is_multiply,max_displacement_,kernel_size_, - neighborhood_grid_radius_,neighborhood_grid_width_,kernel_radius_,stride1_,stride2_,stream,stream_tmp1,stream_tmp2); + cuda::Forward_gpu(out, data1, data2, tmp1, tmp2, top_channels_, top_height_, + top_width_, pad_size_, is_multiply, max_displacement_, kernel_size_, + neighborhood_grid_radius_, neighborhood_grid_width_, kernel_radius_, + stride1_, stride2_, stream, stream_tmp1, stream_tmp2); } template inline void CorrelationBackward(const Tensor &out_grad, const Tensor &in_grad1, const Tensor &in_grad2, - const Tensor &tmp1, + const Tensor &tmp1, const Tensor &tmp2, - int top_channels_,int top_height_,int top_width_,int pad_size_,bool is_multiply, - int max_displacement_,int kernel_size_,int neighborhood_grid_radius_,int neighborhood_grid_width_, - int kernel_radius_,int stride1_,int stride2_,int num,int channels,int height,int width + int top_channels_, int top_height_, + int top_width_, int pad_size_, bool is_multiply, + int max_displacement_, int kernel_size_, + int neighborhood_grid_radius_, int neighborhood_grid_width_, + int kernel_radius_, int stride1_, + int stride2_, int num, int channels, int height, int width ) { cudaStream_t stream0 = Stream::GetStream(in_grad1.stream_); cudaStream_t stream1 = Stream::GetStream(in_grad2.stream_); - cuda::Backward_gpu( out_grad,in_grad1,in_grad2,tmp1,tmp2,top_channels_,top_height_,top_width_,pad_size_,is_multiply, - max_displacement_,kernel_size_,neighborhood_grid_radius_,neighborhood_grid_width_,kernel_radius_,stride1_,stride2_,stream0,stream1,num,channels,height, width); + cuda::Backward_gpu(out_grad, in_grad1, in_grad2, tmp1, tmp2, top_channels_, + top_height_, top_width_, pad_size_, is_multiply, + max_displacement_, kernel_size_, neighborhood_grid_radius_, + neighborhood_grid_width_, kernel_radius_, stride1_, stride2_, + stream0, stream1, num, channels, height, width); } - } // namespace mshadow - namespace mxnet { namespace op { - template<> Operator* CreateOp(CorrelationParam param) { return new CorrelationOp(param); diff --git a/src/operator/cudnn_batch_norm-inl.h b/src/operator/cudnn_batch_norm-inl.h index b98b4e85df52..c4f9afaafeef 100644 --- a/src/operator/cudnn_batch_norm-inl.h +++ b/src/operator/cudnn_batch_norm-inl.h @@ -89,7 +89,6 @@ class CuDNNBatchNormOp : public Operator { Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); Tensor gamma = in_data[cudnnbatchnorm::kGamma].get_with_shape(Shape1(shape_[1]), s); - if (param_.fix_gamma) gamma = 1.0f; Tensor beta = 
in_data[cudnnbatchnorm::kBeta].get_with_shape(Shape1(shape_[1]), s); Tensor y = out_data[cudnnbatchnorm::kOut].get_with_shape(shape_, s); @@ -99,6 +98,9 @@ class CuDNNBatchNormOp : public Operator { aux_states[cudnnbatchnorm::kMovingInvVar] .get_with_shape(Shape1(shape_[1]), s); float a = 1.0f, b = 0.0f; + + if (ctx.is_train && param_.fix_gamma) gamma = 1.f; + if (ctx.is_train) { Tensor save_mean = out_data[cudnnbatchnorm::kMean].get_with_shape(Shape1(shape_[1]), s); @@ -212,7 +214,7 @@ class CuDNNBatchNormOp : public Operator { save_mean.dptr_, save_inv_var.dptr_), CUDNN_STATUS_SUCCESS); #endif - if (param_.fix_gamma) dgamma = 0; + if (param_.fix_gamma) dgamma = 0.f; } private: diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h index a7f321aeb4d4..9f05c860c286 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -10,19 +10,38 @@ #include #include #include "./convolution-inl.h" +#include "../common/cuda_utils.h" + namespace mxnet { namespace op { -#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 +#if MXNET_USE_CUDNN == 1 +void TuneCudnnConvolution(ConvolutionParam param, + std::vector *in_shape, + std::vector *out_shape, + Context ctx, + cudnnDataType_t dtype, + cudnnConvolutionFwdAlgo_t *algo, + cudnnConvolutionBwdDataAlgo_t *back_algo, + cudnnConvolutionBwdFilterAlgo_t *back_algo_w); + template class CuDNNConvolutionOp : public Operator { public: - explicit CuDNNConvolutionOp(ConvolutionParam param) { + explicit CuDNNConvolutionOp(ConvolutionParam param, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + using namespace mshadow; this->param_ = param; // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); init_cudnn_ = false; - // TODO(xxx): fp16 dtype_ = mshadow::DataType::kCudnnFlag; + + if (param.cudnn_tune != conv::kOff) { + TuneCudnnConvolution(param, in_shape, out_shape, ctx, dtype_, + &algo_, &back_algo_, &back_algo_w_); + } } ~CuDNNConvolutionOp() { @@ -42,50 +61,69 @@ class CuDNNConvolutionOp : public Operator { const std::vector &aux_args) { using namespace mshadow; size_t expected = param_.no_bias ? 
2 : 3; + DType *data_ptr = NULL; + DType *wmat_ptr = NULL; + DType *out_ptr = NULL; CHECK_EQ(in_data.size(), expected); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = in_data[conv::kData].get(s); - Tensor wmat = in_data[conv::kWeight].get(s); - Tensor out = out_data[conv::kOut].get(s); - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(wmat.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); if (!init_cudnn_) { Init(s, in_data, out_data); } Tensor workspace = ctx.requested[conv::kTempSpace].get_space_typed( mshadow::Shape1(forward_workspace_), s); + + if (param_.kernel.ndim() == 2) { + Tensor data = in_data[conv::kData].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor out = out_data[conv::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(wmat.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + data_ptr = data.dptr_; + wmat_ptr = wmat.dptr_; + out_ptr = out.dptr_; + } else { + Tensor data = in_data[conv::kData].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor out = out_data[conv::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(wmat.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + data_ptr = data.dptr_; + wmat_ptr = wmat.dptr_; + out_ptr = out.dptr_; + } for (uint32_t g = 0; g < param_.num_group; ++g) { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; CHECK_EQ(cudnnConvolutionForward(s->dnn_handle_, &alpha, in_desc_, - data.dptr_ + data_offset_ * g, + data_ptr + data_offset_ * g, filter_desc_, - wmat.dptr_ + weight_offset_ * g, + wmat_ptr + weight_offset_ * g, conv_desc_, algo_, workspace.dptr_, forward_workspace_byte_, &beta, out_desc_, - out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS); + out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS); if (!param_.no_bias) { beta = 1.0f; Tensor bias = in_data[conv::kBias].get(s); -#if CUDNN_MAJOR >= 4 + #if CUDNN_MAJOR >= 4 CHECK_EQ(cudnnAddTensor(s->dnn_handle_, - &alpha, - bias_desc_, - bias.dptr_ + bias_offset_ * g, - &beta, - out_desc_, - out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS); -#endif -#if CUDNN_MAJOR == 3 + &alpha, + bias_desc_, + bias.dptr_ + bias_offset_ * g, + &beta, + out_desc_, + out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS); + #endif + #if CUDNN_MAJOR == 3 CHECK_EQ(cudnnAddTensor(s->dnn_handle_, CUDNN_ADD_SAME_C, &alpha, @@ -93,8 +131,8 @@ class CuDNNConvolutionOp : public Operator { bias.dptr_ + bias_offset_ * g, &beta, out_desc_, - out.dptr_ + out_offset_ * g), CUDNN_STATUS_SUCCESS); -#endif + out_ptr + out_offset_ * g), CUDNN_STATUS_SUCCESS); + #endif } } } @@ -109,14 +147,37 @@ class CuDNNConvolutionOp : public Operator { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 
3 : 2; + DType *grad_ptr = NULL; + DType *wmat_ptr = NULL; + DType *gwmat_ptr = NULL; + DType *data_ptr = NULL; + DType *gdata_ptr = NULL; CHECK_EQ(out_grad.size(), 1); CHECK(in_data.size() == expected && in_grad.size() == expected); Stream *s = ctx.get_stream(); - Tensor grad = out_grad[conv::kOut].get(s); - Tensor wmat = in_data[conv::kWeight].get(s); - Tensor gwmat = in_grad[conv::kWeight].get(s); - Tensor data = in_data[conv::kData].get(s); - Tensor gdata = in_grad[conv::kData].get(s); + if (param_.kernel.ndim() == 2) { + Tensor grad = out_grad[conv::kOut].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor gwmat = in_grad[conv::kWeight].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + grad_ptr = grad.dptr_; + wmat_ptr = wmat.dptr_; + gwmat_ptr = gwmat.dptr_; + data_ptr = data.dptr_; + gdata_ptr = gdata.dptr_; + } else { + Tensor grad = out_grad[conv::kOut].get(s); + Tensor wmat = in_data[conv::kWeight].get(s); + Tensor gwmat = in_grad[conv::kWeight].get(s); + Tensor data = in_data[conv::kData].get(s); + Tensor gdata = in_grad[conv::kData].get(s); + grad_ptr = grad.dptr_; + wmat_ptr = wmat.dptr_; + gwmat_ptr = gwmat.dptr_; + data_ptr = data.dptr_; + gdata_ptr = gdata.dptr_; + } Tensor workspace = ctx.requested[conv::kTempSpace].get_space_typed( mshadow::Shape1(backward_workspace_), s); @@ -129,7 +190,7 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(cudnnConvolutionBackwardBias(s->dnn_handle_, &alpha, out_desc_, - grad.dptr_ + out_offset_ * g, + grad_ptr + out_offset_ * g, req[conv::kBias] == kWriteTo ? &beta : &beta_add, bias_desc_, gbias.dptr_ + bias_offset_ * g), @@ -139,59 +200,59 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(cudnnConvolutionBackwardFilter_v3(s->dnn_handle_, &alpha, in_desc_, - data.dptr_ + data_offset_ * g, + data_ptr + data_offset_ * g, out_desc_, - grad.dptr_ + out_offset_ * g, + grad_ptr + out_offset_ * g, conv_desc_, back_algo_w_, workspace.dptr_, backward_workspace_byte_, req[conv::kWeight] == kWriteTo? &beta : &beta_add, filter_desc_, - gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS); + gwmat_ptr + weight_offset_ * g), CUDNN_STATUS_SUCCESS); #elif CUDNN_MAJOR == 5 - CHECK_EQ(cudnnConvolutionBackwardFilter(s->dnn_handle_, + CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_, &alpha, in_desc_, - data.dptr_ + data_offset_ * g, + data_ptr + data_offset_ * g, out_desc_, - grad.dptr_ + out_offset_ * g, + grad_ptr + out_offset_ * g, conv_desc_, back_algo_w_, workspace.dptr_, backward_workspace_byte_, req[conv::kWeight] == kWriteTo? 
&beta : &beta_add, filter_desc_, - gwmat.dptr_ + weight_offset_ * g), CUDNN_STATUS_SUCCESS); + gwmat_ptr + weight_offset_ * g)); #endif #if CUDNN_MAJOR <= 4 CHECK_EQ(cudnnConvolutionBackwardData_v3(s->dnn_handle_, &alpha, filter_desc_, - wmat.dptr_ + weight_offset_ * g, + wmat_ptr + weight_offset_ * g, out_desc_, - grad.dptr_ + out_offset_ * g, + grad_ptr + out_offset_ * g, conv_desc_, back_algo_, workspace.dptr_, backward_workspace_byte_, &beta, in_desc_, - gdata.dptr_ + data_offset_ * g), CUDNN_STATUS_SUCCESS); + gdata_ptr + data_offset_ * g), CUDNN_STATUS_SUCCESS); #elif CUDNN_MAJOR == 5 CHECK_EQ(cudnnConvolutionBackwardData(s->dnn_handle_, &alpha, filter_desc_, - wmat.dptr_ + weight_offset_ * g, + wmat_ptr + weight_offset_ * g, out_desc_, - grad.dptr_ + out_offset_ * g, + grad_ptr + out_offset_ * g, conv_desc_, back_algo_, workspace.dptr_, backward_workspace_byte_, &beta, in_desc_, - gdata.dptr_ + data_offset_ * g), CUDNN_STATUS_SUCCESS); + gdata_ptr + data_offset_ * g), CUDNN_STATUS_SUCCESS); #endif } } @@ -212,97 +273,188 @@ class CuDNNConvolutionOp : public Operator { size_t workspace_byte = static_cast(param_.workspace * sizeof(DType)); size_t back_size = 0; size_t back_size_w = 0; - Tensor data = in_data[conv::kData].get(s); - Tensor out = out_data[conv::kOut].get(s); - data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3]; - out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3]; - weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group - * param_.kernel[0] * param_.kernel[1]; CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc_), CUDNN_STATUS_SUCCESS); CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc_), CUDNN_STATUS_SUCCESS); - #if CUDNN_MAJOR == 5 - CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_, - dtype_, - format_, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group, - param_.kernel[0], - param_.kernel[1]), CUDNN_STATUS_SUCCESS); - #else - CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_, - dtype_, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group, - param_.kernel[0], - param_.kernel[1]), CUDNN_STATUS_SUCCESS); - #endif - CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_, - param_.pad[0], - param_.pad[1], - param_.stride[0], - param_.stride[1], - 1, - 1, - CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc_, + if (param_.kernel.ndim() == 2) { + // 2d conv + Tensor data = in_data[conv::kData].get(s); + Tensor out = out_data[conv::kOut].get(s); + data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3]; + out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3]; + weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group + * param_.kernel[0] * param_.kernel[1]; + #if CUDNN_MAJOR == 5 + CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_, + dtype_, + format_, + param_.num_filter / param_.num_group, + data.shape_[1] / param_.num_group, + param_.kernel[0], + param_.kernel[1]), CUDNN_STATUS_SUCCESS); + #else + CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc_, dtype_, - data.shape_[0], + param_.num_filter / param_.num_group, data.shape_[1] / param_.num_group, - data.shape_[2], - data.shape_[3], - 
data.shape_[1] * data.shape_[2] * data.shape_[3], - data.shape_[2] * data.shape_[3], - data.shape_[3], - 1), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc_, + param_.kernel[0], + param_.kernel[1]), CUDNN_STATUS_SUCCESS); + #endif + CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_, + param_.pad[0], + param_.pad[1], + param_.stride[0], + param_.stride[1], + 1, + 1, + CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc_, + dtype_, + data.shape_[0], + data.shape_[1] / param_.num_group, + data.shape_[2], + data.shape_[3], + data.shape_[1] * data.shape_[2] * data.shape_[3], + data.shape_[2] * data.shape_[3], + data.shape_[3], + 1), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc_, + dtype_, + out.shape_[0], + out.shape_[1] / param_.num_group, + out.shape_[2], + out.shape_[3], + out.shape_[1] * out.shape_[2] * out.shape_[3], + out.shape_[2] * out.shape_[3], + out.shape_[3], + 1), CUDNN_STATUS_SUCCESS); + } else if (param_.kernel.ndim() == 3) { + // 3d conv + Tensor data = in_data[conv::kData].get(s); + Tensor out = out_data[conv::kOut].get(s); + data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * \ + data.shape_[3] * \ + data.shape_[4]; + out_offset_ = out.shape_[1] / param_.num_group * out.shape_[2] * \ + out.shape_[3] * \ + out.shape_[4]; + weight_offset_ = param_.num_filter / param_.num_group * data.shape_[1] / param_.num_group + * param_.kernel[0] * param_.kernel[1] * param_.kernel[2]; + std::vector filter_vec = {static_cast(param_.num_filter / param_.num_group), + static_cast(data.shape_[1] / param_.num_group), + static_cast(param_.kernel[0]), + static_cast(param_.kernel[1]), + static_cast(param_.kernel[2])}; + + std::vector pad_vec = {static_cast(param_.pad[0]), + static_cast(param_.pad[1]), + static_cast(param_.pad[2])}; + + std::vector stride_vec = {static_cast(param_.stride[0]), + static_cast(param_.stride[1]), + static_cast(param_.stride[2])}; + + std::vector upscale_vec = {1, 1, 1}; + + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; + + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), + 1}; + + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; + + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), + 1}; + + #if CUDNN_MAJOR == 5 + CHECK_EQ(cudnnSetFilterNdDescriptor(filter_desc_, dtype_, - out.shape_[0], - out.shape_[1] / param_.num_group, - out.shape_[2], - out.shape_[3], - out.shape_[1] * out.shape_[2] * out.shape_[3], - out.shape_[2] * out.shape_[3], - out.shape_[3], - 1), CUDNN_STATUS_SUCCESS); + format_, + static_cast(filter_vec.size()), + &filter_vec[0]), CUDNN_STATUS_SUCCESS); + #else + LOG(FATAL) << "Only support CUDNN V5 for 3D convolution"; + #endif + CHECK_EQ(cudnnSetConvolutionNdDescriptor(conv_desc_, + 3, + &pad_vec[0], + &stride_vec[0], + &upscale_vec[0], + CUDNN_CROSS_CORRELATION, + dtype_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + 
&istride[0]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0]), CUDNN_STATUS_SUCCESS); + } if (!param_.no_bias) { Tensor bias = in_data[conv::kBias].get(s); bias_offset_ = bias.shape_[0] / param_.num_group; - CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc_, - CUDNN_TENSOR_NCHW, + std::vector bias_shape = {1, + static_cast(bias.shape_[0] / param_.num_group), + 1, 1}; + std::vector bias_stride = {static_cast(bias_offset_), 1, 1, 1}; + if (param_.kernel.ndim() == 3) { + bias_shape.push_back(1); + bias_stride.push_back(1); + } + CHECK_EQ(cudnnSetTensorNdDescriptor(bias_desc_, dtype_, - 1, - bias.shape_[0] / param_.num_group, - 1, - 1), CUDNN_STATUS_SUCCESS); + static_cast(bias_shape.size()), + &bias_shape[0], + &bias_stride[0]), CUDNN_STATUS_SUCCESS); } - CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, - in_desc_, - filter_desc_, - conv_desc_, - out_desc_, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &algo_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, - in_desc_, - out_desc_, - conv_desc_, - filter_desc_, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &back_algo_w_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, - filter_desc_, - out_desc_, - conv_desc_, - in_desc_, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_byte, - &back_algo_), CUDNN_STATUS_SUCCESS); + + if (!param_.cudnn_tune) { + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CHECK_EQ(cudnnGetConvolutionForwardAlgorithm(s->dnn_handle_, + in_desc_, + filter_desc_, + conv_desc_, + out_desc_, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &algo_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + in_desc_, + out_desc_, + conv_desc_, + filter_desc_, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &back_algo_w_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc_, + out_desc_, + conv_desc_, + in_desc_, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_byte, + &back_algo_), CUDNN_STATUS_SUCCESS); + } + CHECK_EQ(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, filter_desc_, out_desc_, @@ -325,8 +477,14 @@ class CuDNNConvolutionOp : public Operator { out_desc_, algo_, &forward_workspace_byte_), CUDNN_STATUS_SUCCESS); + forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1; backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1; + // ugly fix CUDNN algorithm selection + // safe to remove after CuDNN fix 3D conv selection + // if (param_.kernel.ndim() == 3) { + // back_algo_w_ = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + // } } } diff --git a/src/operator/cudnn_convolution.cc b/src/operator/cudnn_convolution.cc new file mode 100644 index 000000000000..b3d6b481b012 --- /dev/null +++ b/src/operator/cudnn_convolution.cc @@ -0,0 +1,295 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file cudnn_convolution.cc + * \brief + * \author Junyuan Xie +*/ +#include "./cudnn_convolution-inl.h" +#include +#include + +#include +#include +#include + +namespace mxnet { +namespace op { +#if MXNET_USE_CUDNN == 1 +namespace conv { +struct CudnnAlgorithms { + cudnnConvolutionFwdAlgo_t fwd; + cudnnConvolutionBwdDataAlgo_t bwd; + cudnnConvolutionBwdFilterAlgo_t flt; +}; + +std::unordered_map g_cudnn_algo_reg; +std::mutex g_reg_mutex; +} // namespace conv +// TODO(xxx): Refactor with Init CuDNN function, remove redundant code in initialization +void TuneCudnnConvolution(ConvolutionParam param, + std::vector *in_shape, + std::vector *out_shape, + Context ctx, + cudnnDataType_t dtype, + cudnnConvolutionFwdAlgo_t *algo, + cudnnConvolutionBwdDataAlgo_t *back_algo, + cudnnConvolutionBwdFilterAlgo_t *back_algo_w) { + using namespace mshadow; + // convert MB to bytes + + size_t expected = param.no_bias ? 2 : 3; +#if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format = CUDNN_TENSOR_NCHW; +#endif + CHECK_EQ(in_shape->size(), expected); + CHECK_EQ(out_shape->size(), 1); + TShape &x_shape = (*in_shape)[conv::kData]; + TShape &w_shape = (*in_shape)[conv::kWeight]; + TShape &y_shape = (*out_shape)[conv::kOut]; + std::ostringstream oss; + oss << x_shape << ";" << y_shape << ";" << w_shape << ";" << param.workspace; + std::string key = oss.str(); + std::unordered_map::const_iterator iter = + conv::g_cudnn_algo_reg.find(key); + if (iter != conv::g_cudnn_algo_reg.end()) { + *algo = iter->second.fwd; + *back_algo = iter->second.bwd; + *back_algo_w = iter->second.flt; + return; + } + + size_t workspace_byte = param.workspace << 20; + cudnnTensorDescriptor_t in_desc; + cudnnTensorDescriptor_t out_desc; + cudnnTensorDescriptor_t bias_desc; + cudnnFilterDescriptor_t filter_desc; + cudnnConvolutionDescriptor_t conv_desc; + CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&bias_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&filter_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateConvolutionDescriptor(&conv_desc), CUDNN_STATUS_SUCCESS); +#if CUDNN_MAJOR == 5 + if (in_shape->at(0).ndim() == 4) { + // 2d conv + CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc, + dtype, + format, + param.num_filter / param.num_group, + x_shape[1] / param.num_group, + param.kernel[0], + param.kernel[1]), CUDNN_STATUS_SUCCESS); + } else { + // 3d conv, only support CUDNN v5 + std::vector filter_vec = {static_cast(param.num_filter / param.num_group), + static_cast(x_shape[1] / param.num_group), + static_cast(param.kernel[0]), + static_cast(param.kernel[1]), + static_cast(param.kernel[2])}; + + std::vector pad_vec = {static_cast(param.pad[0]), + static_cast(param.pad[1]), + static_cast(param.pad[2])}; + + std::vector stride_vec = {static_cast(param.stride[0]), + static_cast(param.stride[1]), + static_cast(param.stride[2])}; + + std::vector upscale_vec = {1, 1, 1}; + CHECK_EQ(cudnnSetConvolutionNdDescriptor(conv_desc, + 3, + &pad_vec[0], + &stride_vec[0], + &upscale_vec[0], + CUDNN_CROSS_CORRELATION, + dtype), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(filter_desc, + dtype, + format, + static_cast(filter_vec.size()), + &filter_vec[0]), CUDNN_STATUS_SUCCESS); + } +#else + CHECK_EQ(cudnnSetFilter4dDescriptor(filter_desc, + dtype, + param.num_filter / param.num_group, + x_shape[1] / param.num_group, + param.kernel[0], + 
param.kernel[1]), CUDNN_STATUS_SUCCESS); +#endif + if (param.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc, + param.pad[0], + param.pad[1], + param.stride[0], + param.stride[1], + 1, + 1, + CUDNN_CROSS_CORRELATION), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptorEx(in_desc, + dtype, + x_shape[0], + x_shape[1] / param.num_group, + x_shape[2], + x_shape[3], + x_shape[1] * x_shape[2] * x_shape[3], + x_shape[2] * x_shape[3], + x_shape[3], + 1), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptorEx(out_desc, + dtype, + y_shape[0], + y_shape[1] / param.num_group, + y_shape[2], + y_shape[3], + y_shape[1] * y_shape[2] * y_shape[3], + y_shape[2] * y_shape[3], + y_shape[3], + 1), CUDNN_STATUS_SUCCESS); + if (!param.no_bias) { + TShape bias_shape = (*in_shape)[conv::kBias]; + CHECK_EQ(cudnnSetTensor4dDescriptor(bias_desc, + CUDNN_TENSOR_NCHW, + dtype, + 1, + bias_shape[0] / param.num_group, + 1, + 1), CUDNN_STATUS_SUCCESS); + } + } else { + // 3d conv + std::vector ishape = {static_cast(in_shape->at(conv::kData)[0]), + static_cast(in_shape->at(conv::kData)[1]), + static_cast(in_shape->at(conv::kData)[2]), + static_cast(in_shape->at(conv::kData)[3]), + static_cast(in_shape->at(conv::kData)[4])}; + + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), + 1}; + + std::vector oshape = {static_cast(out_shape->at(conv::kOut)[0]), + static_cast(out_shape->at(conv::kOut)[1]), + static_cast(out_shape->at(conv::kOut)[2]), + static_cast(out_shape->at(conv::kOut)[3]), + static_cast(out_shape->at(conv::kOut)[4])}; + + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), + 1}; + CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc, + dtype, + static_cast(ishape.size()), + &ishape[0], + &istride[0]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc, + dtype, + static_cast(oshape.size()), + &oshape[0], + &ostride[0]), CUDNN_STATUS_SUCCESS); + if (!param.no_bias) { + TShape bias_shape = (*in_shape)[conv::kBias]; + index_t bias_offset = bias_shape[0] / param.num_group; + std::vector bshape = {1, static_cast(bias_shape[0] / param.num_group), + 1, 1, 1}; + std::vector bias_stride = {static_cast(bias_offset), 1, 1, 1, 1}; + CHECK_EQ(cudnnSetTensorNdDescriptor(bias_desc, + dtype, + static_cast(bshape.size()), + &bshape[0], + &bias_stride[0]), CUDNN_STATUS_SUCCESS); + } + } + + Engine::VarHandle var = Engine::Get()->NewVariable(); + Engine::Get()->PushSync([=](RunContext rctx) { + Stream *s = rctx.get_stream(); + CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + const int kMaxAlgos = 10; + int nalgo = kMaxAlgos; + int i; + + cudnnConvolutionFwdAlgoPerf_t fwd_algo[kMaxAlgos]; + CHECK_EQ(cudnnFindConvolutionForwardAlgorithm(s->dnn_handle_, + in_desc, + filter_desc, + conv_desc, + out_desc, + kMaxAlgos, + &nalgo, + fwd_algo), CUDNN_STATUS_SUCCESS); + i = 0; + while (i < nalgo + && (fwd_algo[i].status != CUDNN_STATUS_SUCCESS + || (param.cudnn_tune == conv::kLimited + && fwd_algo[i].memory > workspace_byte))) ++i; + if (i == nalgo) { + LOG(FATAL) << "Failed to find a convolution algorithm."; + } else { + *algo = fwd_algo[i].algo; + } + + cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_algo[kMaxAlgos]; + 
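+ // The backward-filter and backward-data algorithms below are picked with the same pattern as + // the forward algorithm above: cudnnFind* benchmarks up to kMaxAlgos candidates and returns + // them sorted by runtime, and we keep the first candidate that succeeded and, when cudnn_tune + // is conv::kLimited, also fits in the user-specified workspace limit. + 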
CHECK_EQ(cudnnFindConvolutionBackwardFilterAlgorithm(s->dnn_handle_, + in_desc, + out_desc, + conv_desc, + filter_desc, + kMaxAlgos, + &nalgo, + bwd_filter_algo), CUDNN_STATUS_SUCCESS); + i = 0; + while (i < nalgo + && (bwd_filter_algo[i].status != CUDNN_STATUS_SUCCESS + || (param.cudnn_tune == conv::kLimited + && bwd_filter_algo[i].memory > workspace_byte))) ++i; + if (i == nalgo) { + LOG(FATAL) << "Failed to find a convolution algorithm."; + } else { + *back_algo_w = bwd_filter_algo[i].algo; + } + + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_algo[kMaxAlgos]; + CHECK_EQ(cudnnFindConvolutionBackwardDataAlgorithm(s->dnn_handle_, + filter_desc, + out_desc, + conv_desc, + in_desc, + kMaxAlgos, + &nalgo, + bwd_data_algo), CUDNN_STATUS_SUCCESS); + i = 0; + while (i < nalgo + && (bwd_data_algo[i].status != CUDNN_STATUS_SUCCESS + || (param.cudnn_tune == conv::kLimited + && bwd_data_algo[i].memory > workspace_byte))) ++i; + if (i == nalgo) { + LOG(FATAL) << "Failed to find a convolution algorithm."; + } else { + *back_algo = bwd_data_algo[i].algo; + } + }, ctx, {}, {var}); + Engine::Get()->WaitForVar(var); + Engine::Get()->DeleteVariable([](RunContext s) {}, ctx, var); + + conv::CudnnAlgorithms algs; + algs.fwd = *algo; + algs.bwd = *back_algo; + algs.flt = *back_algo_w; + std::lock_guard guard(conv::g_reg_mutex); + conv::g_cudnn_algo_reg[key] = algs; + + CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(bias_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyFilterDescriptor(filter_desc), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyConvolutionDescriptor(conv_desc), CUDNN_STATUS_SUCCESS); +} +#endif // CUDNN +} // namespace op +} // namespace mxnet diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h index f804419b9c4f..b937b88e1bcd 100644 --- a/src/operator/cudnn_deconvolution-inl.h +++ b/src/operator/cudnn_deconvolution-inl.h @@ -49,6 +49,7 @@ class CuDNNDeconvolutionOp : public Operator { Tensor data = in_data[deconv::kData].get(s); Tensor wmat = in_data[deconv::kWeight].get(s); Tensor out = out_data[deconv::kOut].get(s); + CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(wmat.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); @@ -216,6 +217,8 @@ class CuDNNDeconvolutionOp : public Operator { size_t back_size_w = 0; Tensor data = in_data[deconv::kData].get(s); Tensor out = out_data[deconv::kOut].get(s); + index_t pad_y, pad_x, adj_y, adj_x; + param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x); data_offset_ = data.shape_[1] / param_.num_group * data.shape_[2] * data.shape_[3]; out_offset_ = out.shape_[1] /param_.num_group * out.shape_[2] * out.shape_[3]; weight_offset_ = data.shape_[1] / param_.num_group * param_.num_filter / param_.num_group @@ -242,8 +245,8 @@ class CuDNNDeconvolutionOp : public Operator { param_.kernel[1]), CUDNN_STATUS_SUCCESS); #endif CHECK_EQ(cudnnSetConvolution2dDescriptor(conv_desc_, - param_.pad[0], - param_.pad[1], + pad_y, + pad_x, param_.stride[0], param_.stride[1], 1, diff --git a/src/operator/cudnn_pooling-inl.h b/src/operator/cudnn_pooling-inl.h index c7fa214aa55a..e995a1b289b0 100644 --- a/src/operator/cudnn_pooling-inl.h +++ b/src/operator/cudnn_pooling-inl.h @@ -51,27 +51,46 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(in_data.size(), 1); CHECK_EQ(out_data.size(), 1); Stream *s = ctx.get_stream(); - Tensor data = 
in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } - if (param_.global_pool) { - this->InitGlobalPool(data.shape_); - } float alpha = 1.0f; float beta = 0.0f; - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - CHECK_EQ(cudnnPoolingForward(s->dnn_handle_, - pooling_desc_, - &alpha, - in_desc_, - data.dptr_, - &beta, - out_desc_, - out.dptr_), CUDNN_STATUS_SUCCESS); + if (param_.kernel.ndim() == 2) { + // 2d pool + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + if (!init_cudnn_) { + this->Init(s, in_data, out_data); + } + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(cudnnPoolingForward(s->dnn_handle_, + pooling_desc_, + &alpha, + in_desc_, + data.dptr_, + &beta, + out_desc_, + out.dptr_), CUDNN_STATUS_SUCCESS); + } else if (param_.kernel.ndim() == 3) { + // 3d pool + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + if (!init_cudnn_) { + this->Init(s, in_data, out_data); + } + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + CHECK_EQ(cudnnPoolingForward(s->dnn_handle_, + pooling_desc_, + &alpha, + in_desc_, + data.dptr_, + &beta, + out_desc_, + out.dptr_), CUDNN_STATUS_SUCCESS); + } else { + LOG(FATAL) << "Only 2D or 3D pooling is supported"; + } } virtual void Backward(const OpContext &ctx, @@ -90,14 +109,34 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(in_grad.size(), 1); Stream *s = ctx.get_stream(); - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); float alpha = 1.0f; float beta = 0.0f; - CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_, + if (param_.kernel.ndim() == 2) { + // 2d pool + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_, + pooling_desc_, + &alpha, + out_desc_, + m_out_data.dptr_, + out_desc_, + m_out_grad.dptr_, + in_desc_, + m_in_data.dptr_, + &beta, + in_desc_, + m_in_grad.dptr_), CUDNN_STATUS_SUCCESS); + } else if (param_.kernel.ndim() == 3) { + // 3d pool + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + CHECK_EQ(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, out_desc_, @@ -109,34 +148,12 @@ class CuDNNPoolingOp : public Operator { &beta, in_desc_, m_in_grad.dptr_), CUDNN_STATUS_SUCCESS); + } else { + LOG(FATAL) << "Only 2D or 3D pooling is supported"; + } } private: - inline void InitGlobalPool(const mshadow::Shape<4> &dshape) { - #if CUDNN_MAJOR == 5 - CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 
1 :param_.stride[1]), - CUDNN_STATUS_SUCCESS); - #else - CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1]), - CUDNN_STATUS_SUCCESS); - #endif - } - inline void Init(mshadow::Stream *s, const std::vector &in_data, const std::vector &out_data) { @@ -148,45 +165,117 @@ class CuDNNPoolingOp : public Operator { CHECK_EQ(out_data.size(), 1); if (!init_cudnn_) { init_cudnn_ = true; - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3]), CUDNN_STATUS_SUCCESS); - CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3]), CUDNN_STATUS_SUCCESS); - #if CUDNN_MAJOR == 5 - CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.kernel[0], - param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.stride[0], - param_.stride[1]), CUDNN_STATUS_SUCCESS); - #else - CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.kernel[0], - param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.stride[0], - param_.stride[1]), CUDNN_STATUS_SUCCESS); - #endif + if (param_.kernel.ndim() == 2) { + // 2d pool + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + mshadow::Shape<4> dshape = data.shape_; + CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3]), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + nan_prop_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 : param_.stride[1]), + CUDNN_STATUS_SUCCESS); + #else + CHECK_EQ(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1]), + CUDNN_STATUS_SUCCESS); + #endif + } else { + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + CHECK_EQ(cudnnCreatePoolingDescriptor(&pooling_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; + + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), + 1}; + + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; + + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), + 1}; + + std::vector kernel_vec = {param_.global_pool ? ishape[2] : + static_cast(param_.kernel[0]), + param_.global_pool ? ishape[3] : + static_cast(param_.kernel[1]), + param_.global_pool ? ishape[4] : + static_cast(param_.kernel[2])}; + + std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + param_.global_pool ? 0 : static_cast(param_.pad[1]), + param_.global_pool ? 0 : static_cast(param_.pad[2])}; + + std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + param_.global_pool ? 1 : static_cast(param_.stride[1]), + param_.global_pool ? 1 : static_cast(param_.stride[2])}; + + CHECK_EQ(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + &istride[0]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0]), CUDNN_STATUS_SUCCESS); + #if CUDNN_MAJOR == 5 + CHECK_EQ(cudnnSetPoolingNdDescriptor(pooling_desc_, + mode_, + nan_prop_, + static_cast(kernel_vec.size()), + &(kernel_vec[0]), + &(pad_vec[0]), + &(stride_vec[0])), CUDNN_STATUS_SUCCESS); + #else + LOG(FATAL) << "3D pooling requires CUDNN v5 or above"; + #endif + } } } bool init_cudnn_; diff --git a/src/operator/cudnn_rnn-inl.h b/src/operator/cudnn_rnn-inl.h new file mode 100644 index 000000000000..5707846a781f --- /dev/null +++ b/src/operator/cudnn_rnn-inl.h @@ -0,0 +1,504 @@ +/*! 
+ * Copyright (c) 2016 by Contributors + * \file cudnn_rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_CUDNN_RNN_INL_H_ +#define MXNET_OPERATOR_CUDNN_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include "./rnn-inl.h" + +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNRNNOp : public Operator { + public: + explicit CuDNNRNNOp(RNNParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + // Defaults + input_mode_ = CUDNN_LINEAR_INPUT; // Don't support this yet + // RNN Mode + switch (param_.mode) { + case rnn_enum::kRnnRelu: + mode_ = CUDNN_RNN_RELU; + break; + case rnn_enum::kRnnTanh: + mode_ = CUDNN_RNN_TANH; + break; + case rnn_enum::kLstm: + mode_ = CUDNN_LSTM; + break; + case rnn_enum::kGru: + mode_ = CUDNN_GRU; + break; + default: + LOG(FATAL) << "Not implemented"; + } + // RNN Direction + direction_ = param_.bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + // Other + if (param_.mode == rnn_enum::kLstm) + param_.lstm_q_ = true; + else + param_.lstm_q_ = false; + } + + ~CuDNNRNNOp() { + if (init_cudnn_) { + for (int i = 0; i < x_desc_vec_.size(); ++i) { + CHECK_EQ(cudnnDestroyTensorDescriptor(x_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(y_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dx_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dy_desc_vec_[i]), CUDNN_STATUS_SUCCESS); + } + CHECK_EQ(cudnnDestroyTensorDescriptor(hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(dcy_desc_), CUDNN_STATUS_SUCCESS); + + CHECK_EQ(cudnnDestroyFilterDescriptor(w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyRNNDescriptor(rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyDropoutDescriptor(dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudaFree(dropout_states_), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
3 : 2; + if (!param_.state_outputs) + out_expected = 1; + + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + + void * hy_ptr = NULL; + if (param_.state_outputs) + hy_ptr = out_data[rnn_enum::kStateOut].get(s).dptr_; + + DType * cx_ptr = NULL; + DType * cy_ptr = NULL; + if (param_.mode == rnn_enum::kLstm) { + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + cy_ptr = (out_data[rnn_enum::kStateCellOut].get(s)).dptr_; + } + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + if (ctx.is_train) { + CHECK_EQ(cudnnRNNForwardTraining(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); + } else { + // inference mode + CHECK_EQ(cudnnRNNForwardInference(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + w_desc_, + w.dptr_, + y_desc_vec_.data(), + y.dptr_, + hy_desc_, + hy_ptr, + cy_desc_, + cy_ptr, + temp_space.dptr_, + workspace_byte_), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
3 : 2; + if (!param_.state_outputs) + out_expected = 1; + + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + CHECK_EQ(in_grad.size(), in_expected); + CHECK_EQ(out_grad.size(), out_expected); + + Stream *s = ctx.get_stream(); + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor dx = in_grad[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + Tensor dw = in_grad[rnn_enum::kParams].get(s); + Tensor hx = in_data[rnn_enum::kState].get(s); + Tensor dhx = in_grad[rnn_enum::kState].get(s); + Tensor y = out_data[rnn_enum::kOut].get(s); + Tensor dy = out_grad[rnn_enum::kOut].get(s); + + // only need kStateOut grad when state_outputs is true + void * dhy_ptr = NULL; + if (param_.state_outputs) + dhy_ptr = out_grad[rnn_enum::kStateOut].get(s).dptr_; + + // Deal with lstm + void * dcx_ptr = NULL; + void * dcy_ptr = NULL; + void * cx_ptr = NULL; + + if (param_.mode == rnn_enum::kLstm) { + cx_ptr = (in_data[rnn_enum::kStateCell].get(s)).dptr_; + dcx_ptr = (in_grad[rnn_enum::kStateCell].get(s)).dptr_; + } + if ((param_.mode == rnn_enum::kLstm) && param_.state_outputs) + dcy_ptr = (out_grad[rnn_enum::kStateCellOut].get(s)).dptr_; + + CHECK_EQ(x.CheckContiguous(), true); + CHECK_EQ(w.CheckContiguous(), true); + CHECK_EQ(hx.CheckContiguous(), true); + CHECK_EQ(y.CheckContiguous(), true); + + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + + // Get temp space + int temp_size = workspace_size_; + temp_size += ctx.is_train ? reserve_space_size_ : 0; + Tensor temp_space = + ctx.requested[rnn_enum::kTempSpace].get_space_typed( + mshadow::Shape1(temp_size), s); + CHECK_EQ(cudnnRNNBackwardData(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + y_desc_vec_.data(), + y.dptr_, + dy_desc_vec_.data(), + dy.dptr_, + dhy_desc_, + dhy_ptr, + dcy_desc_, + dcy_ptr, + w_desc_, + w.dptr_, + hx_desc_, + hx.dptr_, + cx_desc_, + cx_ptr, + dx_desc_vec_.data(), + dx.dptr_, + dhx_desc_, + dhx.dptr_, + dcx_desc_, + dcx_ptr, + temp_space.dptr_, + workspace_byte_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnRNNBackwardWeights(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + x.dptr_, + hx_desc_, + hx.dptr_, + y_desc_vec_.data(), + y.dptr_, + temp_space.dptr_, + workspace_byte_, + dw_desc_, + dw.dptr_, + temp_space.dptr_ + workspace_size_, + reserve_space_byte_), CUDNN_STATUS_SUCCESS); + } + + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + size_t in_expected = param_.lstm_q_ ? 4 : 3; + size_t out_expected = param_.lstm_q_ ? 
3 : 2; + if (!param_.state_outputs) + out_expected = 1; + + CHECK_EQ(in_data.size(), in_expected); + CHECK_EQ(out_data.size(), out_expected); + if (!init_cudnn_) { + init_cudnn_ = true; + // get input + output tensors + Tensor x = in_data[rnn_enum::kData].get(s); + Tensor w = in_data[rnn_enum::kParams].get(s); + param_.seq_length_ = x.shape_[0]; + param_.batch_size_ = x.shape_[1]; + param_.input_size_ = x.shape_[2]; + + // Tensor Descriptors + std::vector x_vec(param_.seq_length_); + std::vector y_vec(param_.seq_length_); + std::vector dx_vec(param_.seq_length_); + std::vector dy_vec(param_.seq_length_); + int dimA[3]; + int strideA[3]; + for (int i = 0; i < param_.seq_length_; i++) { + CHECK_EQ(cudnnCreateTensorDescriptor(&x_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&y_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dx_vec[i]), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dy_vec[i]), CUDNN_STATUS_SUCCESS); + + dimA[0] = param_.batch_size_; + dimA[1] = param_.input_size_; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CHECK_EQ(cudnnSetTensorNdDescriptor(x_vec[i], + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dx_vec[i], + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + dimA[0] = param_.batch_size_; + dimA[1] = param_.bidirectional ? param_.state_size * 2 : param_.state_size; + dimA[2] = 1; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CHECK_EQ(cudnnSetTensorNdDescriptor(y_vec[i], + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dy_vec[i], + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + } + x_desc_vec_ = x_vec; + y_desc_vec_ = y_vec; + dx_desc_vec_ = dx_vec; + dy_desc_vec_ = dy_vec; + + // set the state tensors + dimA[0] = param_.num_layers * (param_.bidirectional ? 
2 : 1); + dimA[1] = param_.batch_size_; + dimA[2] = param_.state_size; + strideA[0] = dimA[2] * dimA[1]; + strideA[1] = dimA[2]; + strideA[2] = 1; + + CHECK_EQ(cudnnCreateTensorDescriptor(&hx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&hy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&cy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcx_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dhy_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&dcy_desc_), CUDNN_STATUS_SUCCESS); + + CHECK_EQ(cudnnSetTensorNdDescriptor(hx_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cx_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(hy_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(cy_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhx_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcx_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dhy_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensorNdDescriptor(dcy_desc_, + dtype_, + 3, + dimA, + strideA), CUDNN_STATUS_SUCCESS); + + // Create Dropout descriptors + CHECK_EQ(cudnnCreateDropoutDescriptor(&dropout_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDropoutGetStatesSize(s->dnn_handle_, + &dropout_byte_), CUDNN_STATUS_SUCCESS); + dropout_size_ = dropout_byte_ / sizeof(DType); + CHECK_EQ(cudaMalloc(&dropout_states_, dropout_byte_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetDropoutDescriptor(dropout_desc_, + s->dnn_handle_, + param_.p, // keep probability + dropout_states_, + dropout_byte_, + seed_), CUDNN_STATUS_SUCCESS); + // RNN descriptors + CHECK_EQ(cudnnCreateRNNDescriptor(&rnn_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetRNNDescriptor(rnn_desc_, + param_.state_size, + param_.num_layers, + dropout_desc_, + input_mode_, + direction_, + mode_, + dtype_), CUDNN_STATUS_SUCCESS); + // Get temp space sizes + CHECK_EQ(cudnnGetRNNWorkspaceSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &workspace_byte_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnGetRNNTrainingReserveSize(s->dnn_handle_, + rnn_desc_, + param_.seq_length_, + x_desc_vec_.data(), + &reserve_space_byte_), CUDNN_STATUS_SUCCESS); + workspace_size_ = workspace_byte_ / sizeof(DType); + reserve_space_size_ = reserve_space_byte_ / sizeof(DType); + + // check that number of params are correct + size_t cudnn_param_size; + CHECK_EQ(cudnnGetRNNParamsSize(s->dnn_handle_, + rnn_desc_, + x_desc_vec_[0], + &cudnn_param_size, + dtype_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(w.shape_[0] * sizeof(DType), cudnn_param_size); + + // Set param descriptors + CHECK_EQ(cudnnCreateFilterDescriptor(&w_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateFilterDescriptor(&dw_desc_), CUDNN_STATUS_SUCCESS); + int dim_w[3] = {1, 1, 1}; + dim_w[0] = w.shape_[0]; + CHECK_EQ(cudnnSetFilterNdDescriptor(w_desc_, + dtype_, + format_, + 3, + dim_w), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetFilterNdDescriptor(dw_desc_, + dtype_, + format_, + 3, + dim_w), CUDNN_STATUS_SUCCESS); + } + } + + 
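+ // The cuDNN state below is created once in Init() and released in the destructor. + // workspace_size_ and reserve_space_size_ count DType elements (bytes / sizeof(DType)), so + // Forward and Backward can serve both the cuDNN workspace and the training reserve space + // from a single temp-space request of workspace_size_ + reserve_space_size_ elements. + 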
cudnnDataType_t dtype_; + bool init_cudnn_; + cudnnRNNDescriptor_t rnn_desc_; + cudnnRNNMode_t mode_; + cudnnDirectionMode_t direction_; + cudnnRNNInputMode_t input_mode_; + cudnnDropoutDescriptor_t dropout_desc_; + void *dropout_states_; + uint64_t seed_ = 1337ull; + size_t workspace_byte_, reserve_space_byte_, dropout_byte_; + int workspace_size_, reserve_space_size_, dropout_size_; + + std::vector x_desc_vec_, y_desc_vec_, dx_desc_vec_, dy_desc_vec_; + cudnnTensorDescriptor_t hx_desc_, cx_desc_; + cudnnTensorDescriptor_t hy_desc_, cy_desc_; + cudnnTensorDescriptor_t dhx_desc_, dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_, dcy_desc_; + + cudnnFilterDescriptor_t w_desc_, dw_desc_; + + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + RNNParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_RNN_INL_H_ diff --git a/src/operator/cudnn_spatial_transformer-inl.h b/src/operator/cudnn_spatial_transformer-inl.h new file mode 100644 index 000000000000..12e30b603582 --- /dev/null +++ b/src/operator/cudnn_spatial_transformer-inl.h @@ -0,0 +1,182 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file cudnn_spatial_transformer-inl.h + * \brief + * \author Wei Wu +*/ +#ifndef MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ +#define MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ + +#include +#include +#include "./spatial_transformer-inl.h" +namespace mxnet { +namespace op { +#if defined(__CUDACC__) && MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +template +class CuDNNSpatialTransformerOp : public Operator { + public: + explicit CuDNNSpatialTransformerOp(SpatialTransformerParam param) { + this->param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; + if (param_.sampler_type == st::kBilinear) { + sampler_ = CUDNN_SAMPLER_BILINEAR; + } + } + + ~CuDNNSpatialTransformerOp() { + if (init_cudnn_) { + CHECK_EQ(cudnnDestroySpatialTransformerDescriptor(st_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnDestroyTensorDescriptor(out_desc_), CUDNN_STATUS_SUCCESS); + } + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 3); + Stream *s = ctx.get_stream(); + Tensor data = in_data[st::kData].get(s); + Tensor out = out_data[st::kOut].get(s); + Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + Shape<4> grid_shape = Shape4(out.size(0), out.size(2), out.size(3), 2); + Tensor loc = in_data[st::kLoc].get_with_shape(loc_shape, s); + Tensor grid = out_data[st::kGridSrc] + .get_with_shape(grid_shape, s); + if (!init_cudnn_) { + Init(s, in_data, out_data); + } + CHECK_EQ(data.CheckContiguous(), true); + CHECK_EQ(out.CheckContiguous(), true); + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + if (param_.transform_type == st::kAffine) { + CHECK_EQ(cudnnSpatialTfGridGeneratorForward(s->dnn_handle_, + st_desc_, + loc.dptr_, + grid.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + CHECK_EQ(cudnnSpatialTfSamplerForward(s->dnn_handle_, + st_desc_, + &alpha, + in_desc_, + data.dptr_, + grid.dptr_, + &beta, + out_desc_, + out.dptr_/*output*/), CUDNN_STATUS_SUCCESS); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const 
std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(out_grad.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[st::kData].get(s); + Tensor grad = out_grad[st::kOut].get(s); + Tensor ddata = in_grad[st::kData].get(s); + Shape<3> loc_shape = Shape3(data.size(0), 2, 3); + Shape<4> grid_shape = Shape4(grad.size(0), grad.size(2), grad.size(3), 2); + Tensor dloc = in_grad[st::kLoc].get_with_shape(loc_shape, s); + Tensor grid = out_data[st::kGridSrc] + .get_with_shape(grid_shape, s); + // do not use out_grad[st::kGridSrc]: dgrid is an intermediate tensor that is not included + // in DeclareBackwardDependency; instead, we reuse grid as the in-place output for dgrid + typename DataType::ScaleType alpha = 1.0f; + typename DataType::ScaleType beta = 0.0f; + typename DataType::ScaleType alpha_dgrid = 1.0f; + typename DataType::ScaleType beta_dgrid = 0.0f; + CHECK_EQ(cudnnSpatialTfSamplerBackward(s->dnn_handle_, + st_desc_, + &alpha, + in_desc_, + data.dptr_, + &beta, + in_desc_/*reuse in_desc_*/, + ddata.dptr_/*output*/, + &alpha_dgrid, + out_desc_/*reuse out_desc_*/, + grad.dptr_, + grid.dptr_, + &beta_dgrid, + grid.dptr_/*output, reuse grid*/), CUDNN_STATUS_SUCCESS); + if (param_.transform_type == st::kAffine) { + CHECK_EQ(cudnnSpatialTfGridGeneratorBackward(s->dnn_handle_, + st_desc_, + grid.dptr_, + dloc.dptr_/*out*/), CUDNN_STATUS_SUCCESS); + } + } + + private: + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { + using namespace mshadow; + #if CUDNN_MAJOR == 5 + format_ = CUDNN_TENSOR_NCHW; + #endif + CHECK_EQ(in_data.size(), 2); + CHECK_EQ(out_data.size(), 3); + if (!init_cudnn_) { + init_cudnn_ = true; + Tensor data = in_data[st::kData].get(s); + Tensor out = out_data[st::kOut].get(s); + CHECK_EQ(cudnnCreateSpatialTransformerDescriptor(&st_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&in_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnCreateTensorDescriptor(&out_desc_), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(in_desc_, + format_, + dtype_, + data.size(0), + data.size(1), + data.size(2), + data.size(3)), CUDNN_STATUS_SUCCESS); + CHECK_EQ(cudnnSetTensor4dDescriptor(out_desc_, + format_, + dtype_, + out.size(0), + out.size(1), + out.size(2), + out.size(3)), CUDNN_STATUS_SUCCESS); + if (param_.sampler_type == st::kBilinear) { + int dim[] = {static_cast(out.size(0)), static_cast(out.size(1)), + static_cast(out.size(2)), static_cast(out.size(3))}; + CHECK_EQ(cudnnSetSpatialTransformerNdDescriptor(st_desc_, + sampler_, + dtype_, + 4, + dim) , CUDNN_STATUS_SUCCESS); + } + } + } + + bool init_cudnn_; + cudnnDataType_t dtype_; + cudnnSpatialTransformerDescriptor_t st_desc_; + cudnnTensorDescriptor_t in_desc_; + cudnnTensorDescriptor_t out_desc_; + cudnnSamplerType_t sampler_; + #if CUDNN_MAJOR == 5 + cudnnTensorFormat_t format_; + #endif + SpatialTransformerParam param_; +}; +#endif // __CUDACC__ && CUDNN +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CUDNN_SPATIAL_TRANSFORMER_INL_H_ diff --git a/src/operator/custom.cc b/src/operator/custom.cc index b09bae006b82..09ab894044df 100644 --- a/src/operator/custom.cc +++ b/src/operator/custom.cc @@ -1,6 +1,6 @@ /*! 
* Copyright (c) 2015 by Contributors - * \file ndarray_op.cc + * \file custom.cc * \brief * \author Junyuan Xie */ diff --git a/src/operator/deconvolution-inl.h b/src/operator/deconvolution-inl.h index a1590956e8c7..c8a732f6f42b 100644 --- a/src/operator/deconvolution-inl.h +++ b/src/operator/deconvolution-inl.h @@ -31,6 +31,8 @@ struct DeconvolutionParam : public dmlc::Parameter { TShape kernel; TShape stride; TShape pad; + TShape adj; + TShape target_shape; uint32_t num_filter; uint32_t num_group; uint64_t workspace; @@ -39,18 +41,51 @@ int shape[] = {1, 1}; DMLC_DECLARE_FIELD(kernel).describe("deconvolution kernel size: (y, x)"); DMLC_DECLARE_FIELD(stride).set_default(TShape(shape, shape + 2)) - .describe("deconvolution stride: (y, x)"); + .describe("deconvolution stride: (y, x)"); shape[0] = shape[1] = 0; DMLC_DECLARE_FIELD(pad).set_default(TShape(shape, shape + 2)) - .describe("pad for deconvolution: (y, x)"); + .describe("pad for deconvolution: (y, x), a good number is (kernel-1)/2; " + "if target_shape is set, pad will be ignored and will be computed " + "automatically"); + DMLC_DECLARE_FIELD(adj).set_default(TShape(shape, shape + 2)) + .describe("adjustment for output shape: (y, x); if target_shape is set, adj " + "will be ignored and will be computed automatically"); + DMLC_DECLARE_FIELD(target_shape).set_default(TShape(shape, shape + 2)) + .describe("output shape with target shape: (y, x)"); DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000) - .describe("deconvolution filter(channel) number"); + .describe("deconvolution filter(channel) number"); DMLC_DECLARE_FIELD(num_group).set_default(1) - .describe("number of groups partition"); + .describe("number of groups partition"); DMLC_DECLARE_FIELD(workspace).set_default(512).set_range(0, 8192) - .describe("Tmp workspace for deconvolution (MB)"); + .describe("Tmp workspace for deconvolution (MB)"); DMLC_DECLARE_FIELD(no_bias).set_default(true) - .describe("Whether to disable bias parameter."); + .describe("Whether to disable bias parameter."); + } + + inline void InferPad(index_t input_y, index_t input_x, + index_t* o_pad_y, index_t* o_pad_x, + index_t* o_adj_y, index_t* o_adj_x) const { + index_t& pad_y = *o_pad_y; + index_t& pad_x = *o_pad_x; + index_t& adj_y = *o_adj_y; + index_t& adj_x = *o_adj_x; + if (target_shape[0] != 0 || target_shape[1] != 0) { + pad_y = stride[0] * (input_y - 1) + kernel[0]; + pad_x = stride[1] * (input_x - 1) + kernel[1]; + CHECK_GE(pad_y, target_shape[0]) + << "too big target shape"; + CHECK_GE(pad_x, target_shape[1]) + << "too big target shape"; + pad_y -= target_shape[0]; + pad_x -= target_shape[1]; + adj_y = pad_y % 2; pad_y = (pad_y + 1) / 2; + adj_x = pad_x % 2; pad_x = (pad_x + 1) / 2; + } else { + pad_y = pad[0]; + pad_x = pad[1]; + adj_y = adj[0]; + adj_x = adj[1]; + } } }; @@ -77,6 +112,10 @@ class DeconvolutionOp : public Operator { Stream *s = ctx.get_stream(); Tensor data = in_data[deconv::kData].get(s); Tensor out = out_data[deconv::kOut].get(s); + + index_t pad_y, pad_x, adj_y, adj_x; + param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x); + Shape<3> wmat_shape = Shape3(param_.num_group, data.shape_[1] / param_.num_group, @@ -103,7 +142,7 @@ class DeconvolutionOp : public Operator { shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (param_.pad[0] == 0 && param_.pad[1] == 0) { + if (pad_y == 0 && pad_x == 0) { temp_col = 
unpack_patch2col(out.Slice(i, i + step), param_.kernel[0], param_.kernel[1], @@ -112,7 +151,7 @@ class DeconvolutionOp : public Operator { 1, 1); // Deconvolution only support dilate equals 1 } else { temp_col = unpack_patch2col(pad(out.Slice(i, i + step), - param_.pad[0], param_.pad[1]), + pad_y, pad_x), param_.kernel[0], param_.kernel[1], param_.stride[0], @@ -125,7 +164,7 @@ class DeconvolutionOp : public Operator { gstride * (gid + 1)); tmpc = dot(wmat[gid].T(), temp_dst[gid]); } - if (param_.pad[0] == 0 && param_.pad[1] == 0) { + if (pad_y == 0 && pad_x == 0) { out.Slice(i, i + step) = pack_col2patch(temp_col, out.Slice(i, i + step).shape_, param_.kernel[0], @@ -134,8 +173,8 @@ class DeconvolutionOp : public Operator { 1); // Deconvolution only support dilate equals 1 } else { Shape<4> pshape = out.Slice(i, i + step).shape_; - pshape[2] += 2 * param_.pad[0]; - pshape[3] += 2 * param_.pad[1]; + pshape[2] += 2 * pad_y; + pshape[3] += 2 * pad_x; out.Slice(i, i + step) = crop(pack_col2patch(temp_col, pshape, param_.kernel[0], @@ -184,6 +223,9 @@ class DeconvolutionOp : public Operator { CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) << "Must init CuBLAS handle in stream"; #endif + index_t pad_y, pad_x, adj_y, adj_x; + param_.InferPad(data.size(2), data.size(3), &pad_y, &pad_x, &adj_y, &adj_x); + const index_t nbatch = data.size(0); Tensor workspace = ctx.requested[deconv::kTempSpace].get_space_typed( @@ -200,7 +242,7 @@ class DeconvolutionOp : public Operator { shape_dstunit_[1], shape_dstunit_[2] * step), s); temp_dst = reshape(swapaxis<1, 0>(data.Slice(i, i + step)), temp_dst.shape_); - if (param_.pad[0] == 0 && param_.pad[1] == 0) { + if (pad_y == 0 && pad_x == 0) { temp_col = unpack_patch2col(grad.Slice(i, i + step), param_.kernel[0], param_.kernel[1], @@ -208,7 +250,7 @@ class DeconvolutionOp : public Operator { param_.stride[1], 1, 1); // Deconvolution only support dilate equals 1 } else { - temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), param_.pad[0], param_.pad[1]), + temp_col = unpack_patch2col(pad(grad.Slice(i, i + step), pad_y, pad_x), param_.kernel[0], param_.kernel[1], param_.stride[0], @@ -323,8 +365,11 @@ class DeconvolutionProp : public OperatorProperty { } out_shape->clear(); out_shape->push_back(dshape); + // osize = stride * (isize - 1) + ksize - 2 * pad + adj const index_t ksize_y = static_cast(param_.kernel[0]); const index_t ksize_x = static_cast(param_.kernel[1]); + index_t pad_y, pad_x, adj_y, adj_x; + param_.InferPad(dshape[2], dshape[3], &pad_y, &pad_x, &adj_y, &adj_x); CHECK_EQ(dshape[1] % param_.num_group, 0) \ << "input num_filter must divide group size"; CHECK_EQ(param_.num_filter % param_.num_group, 0) \ @@ -333,11 +378,21 @@ class DeconvolutionProp : public OperatorProperty { << "incorrect kernel size: " << param_.kernel; CHECK_GT(param_.stride.Size(), 0) \ << "incorrect stride size: " << param_.stride; + CHECK_GE(ksize_y-1, adj_y) << "adj(y) must be smaller than kernel(h)"; + CHECK_GE(ksize_x-1, adj_x) << "adj(x) must be smaller than kernel(w)"; (*out_shape)[deconv::kOut][1] = param_.num_filter; (*out_shape)[deconv::kOut][2] = param_.stride[0] * (dshape[2] - 1) + - ksize_y - 2 * param_.pad[0]; + ksize_y - 2 * pad_y + adj_y; (*out_shape)[deconv::kOut][3] = param_.stride[1] * (dshape[3] - 1) + - ksize_x - 2 * param_.pad[1]; + ksize_x - 2 * pad_x + adj_x; + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], (*out_shape)[deconv::kOut][2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + if 
(param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], (*out_shape)[deconv::kOut][3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } return true; } diff --git a/src/operator/elementwise_binary_broadcast_op-inl.h b/src/operator/elementwise_binary_broadcast_op-inl.h index b210998e2775..0723657cc3ef 100644 --- a/src/operator/elementwise_binary_broadcast_op-inl.h +++ b/src/operator/elementwise_binary_broadcast_op-inl.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2015 by Contributors + * Copyright (c) 2016 by Contributors * \file elementwise_binary_broadcast_op-inl.h * \brief Function definition of elementwise binary operators with broadcast * @@ -26,16 +26,13 @@ * * Here are examples of shapes that do not broadcast: * - * A (3d tensor): 15 x 3 x 5 - * B (3d tensor): 15 x 1 x 5 # the diminsions for broadcasting should be continous - * * A (1d tensor): 3 * B (1d tensor): 4 # trailing dimensions do not match * * A (2d tensor): 1 x 2 x 1 * B (3d tensor): 8 x 4 x 3 # second from last dimensions mismatched * - * When no broadcast is need, it fails back to elementwise_binary_op-inl.h + * When no broadcast is needed, it falls back to elementwise_binary_op-inl.h */ #ifndef MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_ #define MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_ @@ -44,6 +41,7 @@ #include #include #include "./mshadow_op.h" +#include "./broadcast_reduce_op_common.h" #if defined(__CUDACC__) #define XPU gpu @@ -56,16 +54,14 @@ namespace op { inline bool IsBroadcastNeeded_(const TShape& lhs, const TShape& rhs) { - // force ndim to be equal. do not smartly padding dims with 1s, which may - // confuse users + // force ndim to be equal. do not smartly pad dims with 1s, which may confuse users + CHECK_EQ(lhs.ndim(), rhs.ndim()) << "lhs:" << lhs << " rhs:" << rhs; for (index_t i = 0; i < lhs.ndim(); ++i) { if (lhs[i] != rhs[i]) return true; } return false; } - inline TShape BinaryBroadcastShape_(const TShape& lhs, const TShape& rhs, const EnvArguments& env) { @@ -74,96 +70,66 @@ inline TShape BinaryBroadcastShape_(const TShape& lhs, for (size_t i = 0; i < ret.size(); ++i) { ret[i] = std::max(lhs[i], rhs[i]); } - // check - for (int h = 0; h < 2; ++h) { - const TShape& inp = h == 0 ?
lhs : rhs; - for (int i = 0; i < n; ++i) { - if (inp[i] == ret[i]) { - pos[h*2] = i; break; - } - } - for (int i = n; i > 0; --i) { - if (inp[i-1] == ret[i-1]) { - pos[h*2+1] = i; break; - } - } - } - bool no_broadcast_lhs = pos[0] == 0 && pos[1] == n; - bool no_broadcast_rhs = pos[2] == 0 && pos[3] == n; - int pos_ordered[4] = {0, -1, -1, n}; - if (no_broadcast_lhs && no_broadcast_rhs) { - // no broadcast - LOG(FATAL) << "no broadcast is needed"; - } else if (no_broadcast_lhs && !no_broadcast_rhs) { - // only broadcast rhs - *rhs_broadcast_axis = 1; - *lhs_broadcast_axis = -1; - pos_ordered[1] = pos[2]; - pos_ordered[2] = pos[3]; - } else if (!no_broadcast_lhs && no_broadcast_rhs) { - // only broadcast lhs - *rhs_broadcast_axis = -1; - *lhs_broadcast_axis = 1; - pos_ordered[1] = pos[0]; - pos_ordered[2] = pos[1]; - } else { - // broadcast both lhs and rhs - int p; - if (pos[0] <= pos[2]) { - CHECK(pos[0] == 0 && pos[1] == pos[2] && pos[3] == n) - << "broadcast shape error: lhs = " << lhs << "; rhs = " << rhs; - *lhs_broadcast_axis = 0; - *rhs_broadcast_axis = 1; - p = pos[1]; +inline void InferBroadcastNewShapes_(bool *do_opt, + TShape *new_lhs_shape, TShape *new_rhs_shape, TShape *new_out_shape, + const TShape &lhs_shape, const TShape &rhs_shape, const TShape &out_shape) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK((lhs_shape.ndim() == rhs_shape.ndim()) && (rhs_shape.ndim() == out_shape.ndim())) << + "ndim inconsistency, lhs_shape=" << lhs_shape << ", rhs_shape=" << rhs_shape << + ", out_shape=" << out_shape; + *do_opt = false; + TShape lhs_axes = GetBroadcastingAxes_(lhs_shape, out_shape); + TShape rhs_axes = GetBroadcastingAxes_(rhs_shape, out_shape); + bool lhs_contiguous, rhs_contiguous; + index_t lhs_broadcasting_size, rhs_broadcasting_size; + CheckContiguousAxes_(&lhs_contiguous, &lhs_broadcasting_size, lhs_axes, out_shape); + CheckContiguousAxes_(&rhs_contiguous, &rhs_broadcasting_size, rhs_axes, out_shape); + if (lhs_contiguous && rhs_contiguous && (lhs_axes.ndim() == 0 || rhs_axes.ndim() == 0)) { + *do_opt = true; + if (lhs_axes.ndim() == 0) { + index_t leading = + rhs_shape.ProdShape(0, rhs_axes[0]); + index_t trailing = + rhs_shape.ProdShape(rhs_axes[rhs_axes.ndim() - 1] + 1, rhs_shape.ndim()); + *new_lhs_shape = Shape3(leading, rhs_broadcasting_size, trailing); + *new_rhs_shape = Shape3(leading, 1, trailing); + *new_out_shape = Shape3(leading, rhs_broadcasting_size, trailing); } else { - CHECK(pos[2] == 0 && pos[3] == pos[0] && pos[1] == n) - << "broadcast shape error: lhs = " << lhs << "; rhs = " << rhs; - *lhs_broadcast_axis = 1; - *rhs_broadcast_axis = 0; - p = pos[0]; + index_t leading = + lhs_shape.ProdShape(0, lhs_axes[0]); + index_t trailing = + lhs_shape.ProdShape(lhs_axes[lhs_axes.ndim() - 1] + 1, lhs_shape.ndim()); + *new_lhs_shape = Shape3(leading, 1, trailing); + *new_rhs_shape = Shape3(leading, lhs_broadcasting_size, trailing); + *new_out_shape = Shape3(leading, lhs_broadcasting_size, trailing); } - std::vector dim(2, 1); - for (int i = 0; i < p; ++i) dim[0] *= ret[i]; - for (int i = p; i < n; ++i) dim[1] *= ret[i]; - *ret_reshaped = TShape(dim.begin(), dim.end()); - return; - } - std::vector dim(3, 1); - for (int i = 0; i < 3; ++i) { - for (int j = pos_ordered[i]; j < pos_ordered[i+1]; ++j) { - dim[i] *= ret[j]; + } else { + *do_opt = false; + CHECK(lhs_shape.ndim() <= MXNET_SPECIAL_MAX_NDIM) + << "Only support input dimension up to " << MXNET_SPECIAL_MAX_NDIM + << ", lhs_shape=" << lhs_shape << ", rhs_shape=" << rhs_shape + << ", 
out_shape=" << out_shape; + *new_lhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM); + *new_rhs_shape = TShape(MXNET_SPECIAL_MAX_NDIM); + *new_out_shape = TShape(MXNET_SPECIAL_MAX_NDIM); + for (index_t i = 0; i < lhs_shape.ndim(); i++) { + (*new_lhs_shape)[i] = lhs_shape[i]; + (*new_rhs_shape)[i] = rhs_shape[i]; + (*new_out_shape)[i] = out_shape[i]; } } - *ret_reshaped = TShape(dim.begin(), dim.end()); + CHECK(((*new_lhs_shape).Size() == lhs_shape.Size()) + && ((*new_rhs_shape).Size() == rhs_shape.Size()) + && ((*new_out_shape).Size() == out_shape.Size())) + << "new_lhs_shape:" << *new_lhs_shape << ",lhs_shape:" << lhs_shape + << "new_rhs_shape:" << *new_rhs_shape << ",rhs_shape:" << rhs_shape + << "new_out_shape:" << *new_out_shape << ",out_shape:" << out_shape; } - template void BinaryBroadcastForward_(const TBlob& lhs, const TBlob& rhs, @@ -171,94 +137,61 @@ void BinaryBroadcastForward_(const TBlob& lhs, TBlob *ret, OpReqType req, RunContext ctx) { + using namespace mshadow; using namespace mshadow::expr; - using mshadow::Shape; - using mshadow::Shape1; - using mshadow::Tensor; - mshadow::Stream *s = ctx.get_stream(); + Stream *s = ctx.get_stream(); CHECK_EQ(ret->type_flag_, lhs.type_flag_) << "Binary function only support input/output with the same type"; CHECK_EQ(ret->type_flag_, rhs.type_flag_) << "Binary function only support input/output with the same type"; - + CHECK_EQ(lhs.shape_.ndim(), rhs.shape_.ndim()) << "the ndim of lhs and rhs must be equal," + " shape of lhs=" << lhs.shape_ << " shape of rhs=" << rhs.shape_; if (!IsBroadcastNeeded_(lhs.shape_, rhs.shape_)) { // no broadcast MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - Tensor out = ret->FlatTo2D(s); - ASSIGN_DISPATCH(out, req, - F(lhs.FlatTo2D(s), - rhs.FlatTo2D(s))); - }); + mshadow::Tensor out = ret->FlatTo2D(s); + ASSIGN_DISPATCH(out, req, + F(lhs.FlatTo2D(s), + rhs.FlatTo2D(s))); + }); return; } - - TShape ret_reshaped; - int lhs_broadcast_axis; - int rhs_broadcast_axis; - GetBroadcastShape_(lhs.shape_, rhs.shape_, &ret_reshaped, - &lhs_broadcast_axis, &rhs_broadcast_axis); + bool do_opt; + TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_; + InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_, + lhs.shape_, rhs.shape_, ret->shape_); MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { - if (lhs_broadcast_axis >= 0) { - // broadcast lhs - Tensor mlhs = - lhs.get_with_shape(Shape1(lhs.shape_.Size()), s); - if (rhs_broadcast_axis >= 0) { - // broadcast both - Tensor mrhs = - rhs.get_with_shape(Shape1(rhs.shape_.Size()), s); - - Shape<2> ret_mshape = ret_reshaped.get<2>(); - Tensor out = - ret->get_with_shape(ret_mshape, s); - if (lhs_broadcast_axis == 0) { - ASSIGN_DISPATCH(out, req, - F(broadcast<0>(mlhs, ret_mshape), - broadcast<1>(mrhs, ret_mshape))); - } else { - ASSIGN_DISPATCH(out, req, - F(broadcast<1>(mlhs, ret_mshape), - broadcast<0>(mrhs, ret_mshape))); - } - } else { - // only lhs - Shape<3> ret_mshape = ret_reshaped.get<3>(); - Tensor out = - ret->get_with_shape(ret_mshape, s); - Tensor mrhs = - rhs.get_with_shape(ret_mshape, s); - if (lhs.shape_.Size() == 1) { - ASSIGN_DISPATCH(out, req, - F(broadcast_scalar(mlhs, ret_mshape), mrhs)); - } else { - ASSIGN_DISPATCH(out, req, - F(broadcast<1>(mlhs, ret_mshape), mrhs)); - } - } - } else { - Tensor mrhs = - rhs.get_with_shape(mshadow::Shape1(rhs.shape_.Size()), s); - if (rhs_broadcast_axis >= 0) { - // only rhs - Shape<3> ret_mshape = ret_reshaped.get<3>(); - Tensor out = - ret->get_with_shape(ret_mshape, s); - Tensor mlhs = - 
lhs.get_with_shape(ret_mshape, s); - if (lhs.shape_.Size() == 1) { - ASSIGN_DISPATCH(out, req, - F(mlhs, broadcast_scalar(mrhs, ret_mshape))); - } else { - ASSIGN_DISPATCH(out, req, - F(mlhs, broadcast<1>(mrhs, ret_mshape))); - } - } else { - LOG(FATAL) << "no broadcast is needed"; - } + if (do_opt) { + Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < 3; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; } + Tensor out = ret->get_with_shape(out_new_shape, s); + Tensor mlhs = lhs.get_with_shape(lhs_new_shape, s); + Tensor mrhs = rhs.get_with_shape(rhs_new_shape, s); + ASSIGN_DISPATCH(out, req, + F(broadcast_to(mlhs, out_new_shape_), broadcast_to(mrhs, out_new_shape_))); + } else { + Shape lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; + } + Tensor out = + ret->get_with_shape(out_new_shape, s); + Tensor mlhs = + lhs.get_with_shape(lhs_new_shape, s); + Tensor mrhs = + rhs.get_with_shape(rhs_new_shape, s); + ASSIGN_DISPATCH(out, req, + F(broadcast_to(mlhs, out_new_shape_), broadcast_to(mrhs, out_new_shape_))); + } + }); } - template void BinaryBroadcastBackward_(const OutputGrad& out_grad, const EnvArguments& env, @@ -267,13 +200,16 @@ void BinaryBroadcastBackward_(const OutputGrad& out_grad, OpReqType req_lhs_grad, OpReqType req_rhs_grad, RunContext ctx) { + using namespace mshadow; using namespace mshadow::expr; - using mshadow::Shape; - using mshadow::Shape1; - using mshadow::Shape2; - using mshadow::Tensor; - mshadow::Stream *s = ctx.get_stream(); - + Stream *s = ctx.get_stream(); + CHECK_EQ(out_grad.data.type_flag_, lhs_grad->type_flag_) + << "Binary function only support ingrad/outgrad with the same type"; + CHECK_EQ(out_grad.data.type_flag_, rhs_grad->type_flag_) + << "Binary function only support ingrad/outgrad with the same type"; + CHECK_EQ(lhs_grad->shape_.ndim(), rhs_grad->shape_.ndim()) << + "the ndim of lhs_grad and rhs_grad must be equal," + " shape of lhs_grad=" << lhs_grad->shape_ << " shape of rhs_grad=" << rhs_grad->shape_; if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) { // no broadcast MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { @@ -285,63 +221,39 @@ void BinaryBroadcastBackward_(const OutputGrad& out_grad, }); return; } - - TShape ret_reshaped; - int lhs_broadcast_axis; - int rhs_broadcast_axis; - GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped, - &lhs_broadcast_axis, &rhs_broadcast_axis); - index_t lhs_size = lhs_grad->shape_.Size(); - index_t rhs_size = rhs_grad->shape_.Size(); - + bool do_opt; + TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_; + InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_, + lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_); MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - if (lhs_broadcast_axis >= 0) { - Tensor mlhs_grad = - lhs_grad->get_with_shape(Shape1(lhs_size), s); - if (rhs_broadcast_axis >= 0) { - // broadcast both - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<2>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - if (lhs_broadcast_axis == 0) { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<0>(F(mout_grad))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>(F(mout_grad))); -
} else { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>(F(mout_grad))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<0>(F(mout_grad))); - } - } else { - // only broadcast lhs - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<3>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(ret_reshaped.get<3>(), s); - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>(F(mout_grad))); - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, F(mout_grad)); - } - } else { - if (rhs_broadcast_axis >= 0) { - // only broadcast rhs - Tensor mlhs_grad = - lhs_grad->get_with_shape(ret_reshaped.get<3>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<3>(), s); - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, F(mout_grad)); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>(F(mout_grad))); - } else { - LOG(FATAL) << "no broadcast is needed"; - } + if (do_opt) { + Shape<3> out_new_shape; + for (index_t i = 0; i < 3; i++) { + out_new_shape[i] = out_new_shape_[i]; } - }); + Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, F(mout_grad)); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, F(mout_grad)); + } else { + Shape out_new_shape; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) { + out_new_shape[i] = out_new_shape_[i]; + } + Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, F(mout_grad)); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, F(mout_grad)); + } + }); } template @@ -354,112 +266,71 @@ void BroadcastMulBackward_(const OutputGrad& out_grad, OpReqType req_lhs_grad, OpReqType req_rhs_grad, RunContext ctx) { + using namespace mshadow; using namespace mshadow::expr; - using mshadow::Shape; - using mshadow::Shape1; - using mshadow::Shape2; - using mshadow::Tensor; - mshadow::Stream *s = ctx.get_stream(); - + Stream *s = ctx.get_stream(); if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) { MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - Tensor mout_grad = out_grad.data.FlatTo2D(s); - Tensor mlhs_data = lhs.data.FlatTo2D(s); - Tensor mrhs_data = rhs.data.FlatTo2D(s); - Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - Tensor mrhs_grad = rhs_grad->FlatTo2D(s); - CHECK_NE(req_rhs_grad, kWriteInplace); - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mlhs_data * mout_grad); - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mrhs_data * mout_grad); - }); + mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + CHECK_NE(req_rhs_grad, kWriteInplace); + ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, mlhs_data * mout_grad); + ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mrhs_data * mout_grad); + }); return; } - - TShape ret_reshaped; - int lhs_broadcast_axis; - int rhs_broadcast_axis; - GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped, - &lhs_broadcast_axis, 
&rhs_broadcast_axis); - index_t lhs_size = lhs_grad->shape_.Size(); - index_t rhs_size = rhs_grad->shape_.Size(); - + bool do_opt; + TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_; + InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_, + lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_); MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - if (lhs_broadcast_axis >= 0) { - Tensor mlhs_data = - lhs.data.get_with_shape(Shape1(lhs_size), s); - Tensor mlhs_grad = - lhs_grad->get_with_shape(Shape1(lhs_size), s); - - if (rhs_broadcast_axis >= 0) { - // broadcast both - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<2>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - Tensor mrhs_data = - rhs.data.get_with_shape(Shape1(rhs_size), s); - if (lhs_broadcast_axis == 0) { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<0>( - mout_grad * broadcast<1>(mrhs_data, ret_reshaped.get<2>()))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>( - mout_grad * broadcast<0>(mlhs_data, ret_reshaped.get<2>()))); - } else { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>( - mout_grad * broadcast<0>(mrhs_data, ret_reshaped.get<2>()))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<0>( - mout_grad * broadcast<1>(mlhs_data, ret_reshaped.get<2>()))); - } - } else { - // only broadcast lhs - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<3>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(ret_reshaped.get<3>(), s); - Tensor mrhs_data = - rhs.data.get_with_shape(ret_reshaped.get<3>(), s); - - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>(mout_grad * mrhs_data)); - if (lhs_size == 1) { - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, - mout_grad * broadcast_scalar(mlhs_data, ret_reshaped.get<3>())); - } else { - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, - mout_grad * broadcast<1>(mlhs_data, ret_reshaped.get<3>())); - } - } - } else { - if (rhs_broadcast_axis >= 0) { - // only broadcast rhs - Tensor mlhs_grad = - lhs_grad->get_with_shape(ret_reshaped.get<3>(), s); - Tensor mlhs_data = - lhs.data.get_with_shape(ret_reshaped.get<3>(), s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - Tensor mrhs_data = - rhs.data.get_with_shape(Shape1(rhs_size), s); - Tensor mout_grad = - out_grad.data.get_with_shape(ret_reshaped.get<3>(), s); - - if (rhs_size == 1) { - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, - mout_grad * broadcast_scalar(mrhs_data, ret_reshaped.get<3>())); - } else { - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, - mout_grad * broadcast<1>(mrhs_data, ret_reshaped.get<3>())); - } - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>(mout_grad * mlhs_data)); - } else { - LOG(FATAL) << "no broadcast is needed"; - } + if (do_opt) { + Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < 3; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; } - }); + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, 
rhs_new_shape_, + broadcast_to(mlhs_data, out_new_shape_) * mout_grad); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, + broadcast_to(mrhs_data, out_new_shape_) * mout_grad); + } else { + Shape lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; + } + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, + broadcast_to(mlhs_data, out_new_shape_) * mout_grad); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, + broadcast_to(mrhs_data, out_new_shape_) * mout_grad); + } + }); } template @@ -472,122 +343,165 @@ void BroadcastDivBackward_(const OutputGrad& out_grad, OpReqType req_lhs_grad, OpReqType req_rhs_grad, RunContext ctx) { + using namespace mshadow; using namespace mshadow::expr; - using mshadow::Shape; - using mshadow::Shape1; - using mshadow::Shape2; - using mshadow::Tensor; - mshadow::Stream *s = ctx.get_stream(); - + Stream *s = ctx.get_stream(); if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) { MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - Tensor mout_grad = out_grad.data.FlatTo2D(s); - Tensor mlhs_data = lhs.data.FlatTo2D(s); - Tensor mrhs_data = rhs.data.FlatTo2D(s); - Tensor mlhs_grad = lhs_grad->FlatTo2D(s); - Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); CHECK_NE(req_rhs_grad, kWriteInplace); ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, - F(mout_grad * mlhs_data)/ - F(mrhs_data)); - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mout_grad / mrhs_data); }); + F(mout_grad * mlhs_data) / + F(mrhs_data)); + ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, mout_grad / mrhs_data); + }); return; } - - TShape ret_reshaped; - int lhs_broadcast_axis; - int rhs_broadcast_axis; - GetBroadcastShape_(lhs_grad->shape_, rhs_grad->shape_, &ret_reshaped, - &lhs_broadcast_axis, &rhs_broadcast_axis); - index_t lhs_size = lhs_grad->shape_.Size(); - index_t rhs_size = rhs_grad->shape_.Size(); - + bool do_opt; + TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_; + InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_, + lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_); MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, { - if (lhs_broadcast_axis >= 0) { - Tensor mlhs_data = - lhs.data.get_with_shape(Shape1(lhs_size), s); - Tensor mlhs_grad = - lhs_grad->get_with_shape(Shape1(lhs_size), s); - - if (rhs_broadcast_axis >= 0) { - // broadcast both - Shape<2> rshape = ret_reshaped.get<2>(); - Tensor mout_grad = - out_grad.data.get_with_shape(rshape, s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - Tensor mrhs_data = - rhs.data.get_with_shape(Shape1(rhs_size), s); - if (lhs_broadcast_axis == 0) { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<0>( - mout_grad / 
broadcast<1>(mrhs_data, rshape))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>( - F(mout_grad * broadcast<0>(mlhs_data, rshape)) / - F(broadcast<1>(mrhs_data, rshape)))); - } else { - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>( - mout_grad / broadcast<0>(mrhs_data, rshape))); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<0>( - F(mout_grad * broadcast<1>(mlhs_data, rshape)) / - F(broadcast<0>(mrhs_data, rshape)))); - } - } else { - // only broadcast lhs - Shape<3> rshape = ret_reshaped.get<3>(); - Tensor mout_grad = - out_grad.data.get_with_shape(rshape, s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(rshape, s); - Tensor mrhs_data = - rhs.data.get_with_shape(rshape, s); - - ASSIGN_DISPATCH( - mlhs_grad, req_lhs_grad, sumall_except_dim<1>(mout_grad / mrhs_data)); - if (lhs_size == 1) { - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, - F(mout_grad * broadcast_scalar(mlhs_data, rshape)) / - F(mrhs_data)); - } else { - ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, - F(mout_grad * broadcast<1>(mlhs_data, rshape)) / - F(mrhs_data)); - } + if (do_opt) { + Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < 3; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; } + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, + F(mout_grad * broadcast_to(mlhs_data, out_new_shape_)) / + F(broadcast_to(mrhs_data, out_new_shape_))); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, mout_grad / + broadcast_to(mrhs_data, out_new_shape_)); } else { - if (rhs_broadcast_axis >= 0) { - // only broadcast rhs - Shape<3> rshape = ret_reshaped.get<3>(); - Tensor mlhs_grad = lhs_grad->get_with_shape(rshape, s); - Tensor mlhs_data = lhs.data.get_with_shape(rshape, s); - Tensor mrhs_grad = - rhs_grad->get_with_shape(Shape1(rhs_size), s); - Tensor mrhs_data = - rhs.data.get_with_shape(Shape1(rhs_size), s); - Tensor mout_grad = - out_grad.data.get_with_shape(rshape, s); + Shape lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; + } + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, + F(mout_grad * broadcast_to(mlhs_data, out_new_shape_)) / + F(broadcast_to(mrhs_data, out_new_shape_))); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, mout_grad / + broadcast_to(mrhs_data, out_new_shape_)); + } + }); +} - if (rhs_size == 1) { - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, - mout_grad / broadcast_scalar(mrhs_data, rshape)); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, 
sumall_except_dim<1>( - F(mout_grad * mlhs_data) / - F(broadcast_scalar(mrhs_data, rshape)))); - } else { - ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, - mout_grad / broadcast<1>(mrhs_data, rshape)); - ASSIGN_DISPATCH( - mrhs_grad, req_rhs_grad, sumall_except_dim<1>( - F(mout_grad * mlhs_data) / - F(broadcast<1>(mrhs_data, rshape)))); - } - } else { - LOG(FATAL) << "no broadcast is needed"; +template +void BroadcastPowerBackward_(const OutputGrad& out_grad, + const Input0& lhs, + const Input1& rhs, + const EnvArguments& env, + TBlob* lhs_grad, + TBlob* rhs_grad, + OpReqType req_lhs_grad, + OpReqType req_rhs_grad, + RunContext ctx) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + if (!IsBroadcastNeeded_(lhs_grad->shape_, rhs_grad->shape_)) { + MSHADOW_TYPE_SWITCH(lhs_grad->type_flag_, DType, { + mshadow::Tensor mout_grad = out_grad.data.FlatTo2D(s); + mshadow::Tensor mlhs_data = lhs.data.FlatTo2D(s); + mshadow::Tensor mrhs_data = rhs.data.FlatTo2D(s); + mshadow::Tensor mlhs_grad = lhs_grad->FlatTo2D(s); + mshadow::Tensor mrhs_grad = rhs_grad->FlatTo2D(s); + CHECK_NE(req_rhs_grad, kWriteInplace); + ASSIGN_DISPATCH(mrhs_grad, req_rhs_grad, + F(mlhs_data) * + F(mlhs_data, mrhs_data) * mout_grad); + ASSIGN_DISPATCH(mlhs_grad, req_lhs_grad, + mrhs_data * + F(mlhs_data, mrhs_data - scalar(1)) * + mout_grad); + }); + return; + } + bool do_opt; + TShape lhs_new_shape_, rhs_new_shape_, out_new_shape_; + InferBroadcastNewShapes_(&do_opt, &lhs_new_shape_, &rhs_new_shape_, &out_new_shape_, + lhs_grad->shape_, rhs_grad->shape_, out_grad.data.shape_); + MSHADOW_REAL_TYPE_SWITCH(lhs_grad->type_flag_, DType, { + if (do_opt) { + Shape<3> lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < 3; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; } + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, + F(broadcast_to(mlhs_data, out_new_shape_)) * + F(broadcast_to(mlhs_data, out_new_shape_), + broadcast_to(mrhs_data, out_new_shape_)) * mout_grad); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, + broadcast_to(mrhs_data, out_new_shape_) * + F(broadcast_to(mlhs_data, out_new_shape_), + broadcast_to(mrhs_data, out_new_shape_) - scalar(1)) * + mout_grad); + } else { + Shape lhs_new_shape, rhs_new_shape, out_new_shape; + for (index_t i = 0; i < MXNET_SPECIAL_MAX_NDIM; i++) { + lhs_new_shape[i] = lhs_new_shape_[i]; + rhs_new_shape[i] = rhs_new_shape_[i]; + out_new_shape[i] = out_new_shape_[i]; + } + mshadow::Tensor mout_grad = + out_grad.data.get_with_shape(out_new_shape, s); + mshadow::Tensor mlhs_data = + lhs.data.get_with_shape(lhs_new_shape, s); + mshadow::Tensor mrhs_data = + rhs.data.get_with_shape(rhs_new_shape, s); + mshadow::Tensor mlhs_grad = + lhs_grad->get_with_shape(Shape1(lhs_grad->Size()), s); + mshadow::Tensor mrhs_grad = + rhs_grad->get_with_shape(Shape1(rhs_grad->Size()), s); + ReduceToAssign(mrhs_grad, req_rhs_grad, rhs_new_shape_, + F(broadcast_to(mlhs_data, out_new_shape_)) * + F(broadcast_to(mlhs_data, out_new_shape_), + broadcast_to(mrhs_data, 
out_new_shape_)) * mout_grad); + ReduceToAssign(mlhs_grad, req_lhs_grad, lhs_new_shape_, + broadcast_to(mrhs_data, out_new_shape_) * + F(broadcast_to(mlhs_data, out_new_shape_), + broadcast_to(mrhs_data, out_new_shape_) - scalar(1)) * + mout_grad); } }); } @@ -623,6 +537,13 @@ MXNET_REGISTER_SIMPLE_OP(broadcast_div, XPU) .set_gradient(XPU::kDevMask, BroadcastDivBackward_, kNoInplace) .describe("lhs divide rhs with broadcast"); +MXNET_REGISTER_SIMPLE_OP(broadcast_power, XPU) +.set_shape_function(BinaryBroadcastShape_) +.set_function(XPU::kDevMask, BinaryBroadcastForward_< + XPU, mshadow_op::power>, kNoInplace, kRegisterSymbolic) +.set_gradient(XPU::kDevMask, BroadcastPowerBackward_, kNoInplace) +.describe("lhs power rhs with broadcast"); + } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_ELEMENTWISE_BINARY_BROADCAST_OP_INL_H_ diff --git a/src/operator/elementwise_sum-inl.h b/src/operator/elementwise_sum-inl.h index b7755485bea9..ebf33f90cc1c 100644 --- a/src/operator/elementwise_sum-inl.h +++ b/src/operator/elementwise_sum-inl.h @@ -34,7 +34,7 @@ struct ElementWiseSumParam : public dmlc::Parameter { } }; -template +template class ElementWiseSumOp : public Operator { public: explicit ElementWiseSumOp(ElementWiseSumParam param) @@ -52,34 +52,34 @@ class ElementWiseSumOp : public Operator { if (req[elemsum::kOut] == kNullOp) return; Stream *s = ctx.get_stream(); - Tensor out = out_data[elemsum::kOut].FlatTo2D(s); + Tensor out = out_data[elemsum::kOut].FlatTo2D(s); switch (size_) { case 2: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); Assign(out, req[elemsum::kOut], in_0 + in_1); break; } case 3: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); - Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2); break; } case 4: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); - Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); - Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); - Tensor in_3 = in_data[elemsum::kData3].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_1 = in_data[elemsum::kData1].FlatTo2D(s); + Tensor in_2 = in_data[elemsum::kData2].FlatTo2D(s); + Tensor in_3 = in_data[elemsum::kData3].FlatTo2D(s); Assign(out, req[elemsum::kOut], in_0 + in_1 + in_2 + in_3); break; } default: { - Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); + Tensor in_0 = in_data[elemsum::kData0].FlatTo2D(s); Assign(out, req[elemsum::kOut], F(in_0)); for (int i = 1; i < size_; ++i) { - out += in_data[i].FlatTo2D(s); + out += in_data[i].FlatTo2D(s); } break; } @@ -97,10 +97,10 @@ class ElementWiseSumOp : public Operator { using namespace mshadow::expr; CHECK_EQ(in_grad.size(), static_cast(size_)); Stream *s = ctx.get_stream(); - Tensor ograd = out_grad[elemsum::kOut].FlatTo2D(s); + Tensor ograd = out_grad[elemsum::kOut].FlatTo2D(s); for (int i = 0; i < size_; ++i) { if (req[i] == kNullOp || req[i] == kWriteInplace) continue; - Tensor igrad = in_grad[i].FlatTo2D(s); + Tensor igrad = in_grad[i].FlatTo2D(s); Assign(igrad, req[i], F(ograd)); } } @@ -120,7 +120,7 @@ class ElementWiseSumOp : public Operator { }; // class 
ElementWiseSumOp template -Operator* CreateOp(ElementWiseSumParam param); +Operator* CreateOp(ElementWiseSumParam param, int dtype); #if DMLC_USE_CXX11 class ElementWiseSumProp : public OperatorProperty { @@ -155,6 +155,36 @@ class ElementWiseSumProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + size_t nin = in_type->size(); + CHECK_EQ(nin, static_cast(param_.num_args)); + + int dtype = -1; + for (size_t i = 0; i < nin; ++i) { + if (dtype == -1) { + dtype = in_type->at(i); + } else { + CHECK(in_type->at(i) == dtype || + in_type->at(i) == -1) << + "This operator requires uniform type"; + } + } + + if (dtype == -1) { + LOG(FATAL) << "At least one input type needs to be known"; + return false; + } + + in_type->clear(); + for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + std::vector ListArguments() const override { std::vector ret; for (int i = 0; i < param_.num_args; ++i) { @@ -194,7 +224,13 @@ class ElementWiseSumProp : public OperatorProperty { return {{in_data[0], out_data[0]}}; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; private: ElementWiseSumParam param_; diff --git a/src/operator/elementwise_sum.cc b/src/operator/elementwise_sum.cc index d8546148f76c..fe58cbc0e452 100644 --- a/src/operator/elementwise_sum.cc +++ b/src/operator/elementwise_sum.cc @@ -7,13 +7,22 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ElementWiseSumParam param) { - return new ElementWiseSumOp(param); +Operator* CreateOp(ElementWiseSumParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ElementWiseSumOp(param); + }); + return op; } // DO_BIND_DISPATCH comes from static_operator_common.h -Operator* ElementWiseSumProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +Operator* ElementWiseSumProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + CHECK(InferType(in_type, &out_type, &aux_type)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); } DMLC_REGISTER_PARAMETER(ElementWiseSumParam); diff --git a/src/operator/elementwise_sum.cu b/src/operator/elementwise_sum.cu index 7a9b443dad82..ae373916b7d4 100644 --- a/src/operator/elementwise_sum.cu +++ b/src/operator/elementwise_sum.cu @@ -7,8 +7,12 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(ElementWiseSumParam param) { - return new ElementWiseSumOp(param); +Operator* CreateOp(ElementWiseSumParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ElementWiseSumOp(param); + }); + return op; } } // namespace op } // namespace mxnet diff --git a/src/operator/embedding-inl.h b/src/operator/embedding-inl.h index 8956a92357d1..fc8b7154fa97 100644 --- a/src/operator/embedding-inl.h +++ b/src/operator/embedding-inl.h @@ -22,6 +22,7 @@ namespace op { namespace embedding { enum EmbeddingOpInputs {kData, kWeight}; enum EmbeddingOpOutputs {kOut}; +enum EmbeddingOpResource {kTempSpace}; } // namespace embedding struct EmbeddingParam: public 
dmlc::Parameter { @@ -36,7 +37,7 @@ struct EmbeddingParam: public dmlc::Parameter { }; -template +template class EmbeddingOp : public Operator { public: explicit EmbeddingOp(EmbeddingParam p) { @@ -62,10 +63,10 @@ class EmbeddingOp : public Operator { const TShape& oshape = out_data[embedding::kOut].shape_; Stream *s = ctx.get_stream(); - Tensor data = in_data[embedding::kData].get_with_shape( + Tensor data = in_data[embedding::kData].get_with_shape( Shape1(ishape.ProdShape(0, ishape.ndim())), s); - Tensor wmat = in_data[embedding::kWeight].get(s); - Tensor out = out_data[embedding::kOut].get_with_shape( + Tensor wmat = in_data[embedding::kWeight].get(s); + Tensor out = out_data[embedding::kOut].get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); out = take(data, wmat); } @@ -89,16 +90,33 @@ class EmbeddingOp : public Operator { const TShape& oshape = out_grad[embedding::kOut].shape_; Stream *s = ctx.get_stream(); - Tensor data = in_data[embedding::kData].get_with_shape( + Tensor data = in_data[embedding::kData].get_with_shape( Shape1(ishape.ProdShape(0, ishape.ndim())), s); - Tensor grad_out = out_grad[embedding::kOut].get_with_shape( + Tensor grad_out = out_grad[embedding::kOut].get_with_shape( Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - Tensor grad_in = in_grad[embedding::kWeight].get(s); - if (req[embedding::kWeight] == kWriteTo) { - grad_in = 0.0f; - AddTakeGrad(grad_in, data, grad_out); - } else if (req[embedding::kWeight] == kAddTo) { - AddTakeGrad(grad_in, data, grad_out); + Tensor grad_in = in_grad[embedding::kWeight].get(s); + if (req[embedding::kWeight] == kWriteTo || req[embedding::kWeight] == kAddTo) { + if (req[embedding::kWeight] == kWriteTo) { +#ifdef __CUDACC__ + cudaMemsetAsync(grad_in.dptr_, 0, grad_in.MSize() * sizeof(DType), + Stream::GetStream(s)); +#else + grad_in = scalar(0.0f); +#endif + } + if ((grad_out.shape_[0] < grad_out.shape_[1]) && (grad_out.shape_[0] < 512)) { + AddTakeGrad(grad_in, data, grad_out); + } else { + Tensor workspace = + ctx.requested[embedding::kTempSpace].get_space_typed( + mshadow::Shape2(2, data.shape_.Size()), s); + Tensor sorted_data = workspace[0]; + Tensor original_index = workspace[1]; + sorted_data = tcast(data); + original_index = range(0, data.shape_.Size()); + SortByKey(sorted_data, original_index, true); + AddTakeGradLargeBatch(grad_in, sorted_data, original_index, grad_out); + } } else { LOG(FATAL) << "wrong req"; } @@ -109,7 +127,7 @@ class EmbeddingOp : public Operator { }; // class EmbeddingOp template -Operator* CreateOp(EmbeddingParam param); +Operator* CreateOp(EmbeddingParam param, int dtype); #if DMLC_USE_CXX11 class EmbeddingProp : public OperatorProperty { @@ -146,6 +164,26 @@ class EmbeddingProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. 
given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { auto sym = new EmbeddingProp(); sym->param_ = this->param_; @@ -163,7 +201,18 @@ class EmbeddingProp : public OperatorProperty { return {out_grad[embedding::kOut], in_data[embedding::kData]}; } - Operator* CreateOperator(Context ctx) const override; + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; private: EmbeddingParam param_; diff --git a/src/operator/embedding.cc b/src/operator/embedding.cc index c1b7f59edc47..f26b025657fe 100644 --- a/src/operator/embedding.cc +++ b/src/operator/embedding.cc @@ -9,12 +9,22 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(EmbeddingParam param) { - return new EmbeddingOp(param); +Operator* CreateOp(EmbeddingParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new EmbeddingOp(param); + }); + return op; } -Operator* EmbeddingProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +// DO_BIND_DISPATCH comes from operator_common.h +Operator *EmbeddingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); } DMLC_REGISTER_PARAMETER(EmbeddingParam); diff --git a/src/operator/embedding.cu b/src/operator/embedding.cu index 7c326ec7f806..4f1d8f8e45ee 100644 --- a/src/operator/embedding.cu +++ b/src/operator/embedding.cu @@ -9,8 +9,12 @@ namespace mxnet { namespace op { template<> -Operator* CreateOp(EmbeddingParam param) { - return new EmbeddingOp(param); +Operator* CreateOp(EmbeddingParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new EmbeddingOp(param); + }); + return op; } } // namespace op } // namespace mxnet diff --git a/src/operator/matrix_op-inl.h b/src/operator/matrix_op-inl.h index 1ec78ede6141..8acbdac363b0 100644 --- a/src/operator/matrix_op-inl.h +++ b/src/operator/matrix_op-inl.h @@ -27,6 +27,7 @@ struct TransposeParam : public dmlc::Parameter { } }; + template void TransposeImpl(const TBlob &src, TBlob *ret, @@ -141,6 +142,58 @@ inline TShape TransposeShape(const TShape& shp, } +struct ExpandDimParam : public dmlc::Parameter { + index_t axis; + DMLC_DECLARE_PARAMETER(ExpandDimParam) { + DMLC_DECLARE_FIELD(axis) + .describe("Position (amongst axes) where new axis is to be inserted."); + } +}; + + +inline TShape ExpandDimShape(const TShape& shp, + const EnvArguments& env) { + ExpandDimParam param; + param.Init(env.kwargs); + CHECK_LE(param.axis, shp.ndim()) + << "axis must be smaller than or equal to the dimension of the array"; + std::vector idx(shp.data(), shp.data() + shp.ndim()); + idx.insert(idx.begin() + param.axis, 1); + return TShape(idx.begin(), idx.end()); } + + +template +void ReshapeImpl(const TBlob &src, + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx) { + if (req == kNullOp) return; + if (req == kWriteInplace) { + CHECK(ret->CheckContiguous() &&
src.CheckContiguous()); + } + CHECK_EQ(src.type_flag_, ret->type_flag_); + mshadow::Stream *s = ctx.get_stream(); + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { + using namespace mshadow::expr; + mshadow::Tensor out = ret->FlatTo2D(s); + mshadow::Tensor mout = src.get_with_shape(out.shape_, s); + ASSIGN_DISPATCH(out, req, F(mout)); + }); +} + +template +void ReshapeGrad_(const OutputGrad& out_grad, + const EnvArguments& env, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + ReshapeImpl( + out_grad.data, env, in_grad, req, ctx); +} + + template void DotForward_(const TBlob& lhs, const TBlob& rhs, @@ -211,7 +264,6 @@ void DotBackward_(const OutputGrad& out_grad, } } - inline TShape DotShape(const TShape& lshape, const TShape& rshape, const EnvArguments& env) { @@ -230,6 +282,94 @@ inline TShape DotShape(const TShape& lshape, } } +template +void BatchDotForward_(const TBlob& lhs, + const TBlob& rhs, + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + CHECK_EQ(ret->type_flag_, lhs.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(ret->type_flag_, rhs.type_flag_) + << "Binary function only support input/output with the same type"; + CHECK_EQ(ret->type_flag_, mshadow::kFloat32) + << "dot only support 32 bit float so far"; + + if (lhs.shape_.ndim() == 3 && rhs.shape_.ndim() == 3) { + mshadow::Tensor out = ret->get(s); + mshadow::Tensor mlhs = lhs.get(s); + mshadow::Tensor mrhs = rhs.get(s); + mshadow::Tensor workspace = + env.resource[0].get_space_typed(mshadow::Shape1(3 * out.size(0)), s); + if (kNullOp != req) { + mshadow::BatchGEMM(out, mlhs, mrhs, 1.0f, + (kAddTo == req) ? 1.0f : 0.0f, + workspace); + } + } else { + LOG(FATAL) << "not reached"; + } +} + +template +void BatchDotBackward_(const OutputGrad& out_grad, + const Input0& lhs, + const Input1& rhs, + const EnvArguments& env, + TBlob* lhs_grad, + TBlob* rhs_grad, + OpReqType req_lhs_grad, + OpReqType req_rhs_grad, + RunContext ctx) { + using namespace mshadow::expr; + mshadow::Stream *s = ctx.get_stream(); + CHECK_NE(req_rhs_grad, kWriteInplace); + CHECK_NE(req_lhs_grad, kWriteInplace); + + if (lhs.data.shape_.ndim() == 3 && rhs.data.shape_.ndim() == 3) { + mshadow::Tensor mout_grad = out_grad.data.get(s); + mshadow::Tensor mlhs_data = lhs.data.get(s); + mshadow::Tensor mrhs_data = rhs.data.get(s); + mshadow::Tensor mlhs_grad = lhs_grad->get(s); + mshadow::Tensor mrhs_grad = rhs_grad->get(s); + mshadow::Tensor workspace = + env.resource[0].get_space_typed( + mshadow::Shape2(2, 3 * mout_grad.size(0)), s); + mshadow::Tensor rhs_workspace = workspace[0]; + mshadow::Tensor lhs_workspace = workspace[1]; + if (kNullOp != req_rhs_grad) { + mshadow::BatchGEMM(mrhs_grad, mlhs_data, mout_grad, 1.0f, + (kAddTo == req_rhs_grad) ? 1.0f : 0.0f, + rhs_workspace); + } + if (kNullOp != req_lhs_grad) { + mshadow::BatchGEMM(mlhs_grad, mout_grad, mrhs_data, 1.0f, + (kAddTo == req_lhs_grad) ? 
1.0f : 0.0f, + lhs_workspace); + } + } else { + LOG(FATAL) << "not reached"; + } +} + +inline TShape BatchDotShape(const TShape& lshape, + const TShape& rshape, + const EnvArguments& env) { + if (lshape.ndim() == 3 && rshape.ndim() == 3) { + CHECK(lshape[0] == rshape[0] && lshape[2] == rshape[1]) + << "batch_dot shape error: " << lshape << " X " << rshape; + size_t target_shape[] = {lshape[0], lshape[1], rshape[2]}; + return TShape(target_shape, target_shape + 3); + } else { + LOG(FATAL) << "batch_dot currently only support 3D dot 3D array" + << lshape << " v.s. " << rshape; + return TShape(); + } +} + struct SimpleCropParam : public dmlc::Parameter { TShape begin, end; @@ -241,7 +381,7 @@ struct SimpleCropParam : public dmlc::Parameter { } }; -// matrix crop +// matrix crop for multi dimensional cropping: see also slice template void Crop(const TBlob &src, const EnvArguments& env, @@ -310,6 +450,121 @@ inline TShape CropShape(const TShape& shp, } +struct SliceParam : public dmlc::Parameter { + int axis; + int begin; + int end; + DMLC_DECLARE_PARAMETER(SliceParam) { + DMLC_DECLARE_FIELD(axis).set_lower_bound(0) + .describe("The axis to be sliced"); + DMLC_DECLARE_FIELD(begin).set_lower_bound(0) + .describe("The beginning index to be sliced"); + DMLC_DECLARE_FIELD(end).set_lower_bound(0) + .describe("The end index to be sliced"); + } +}; + +inline TShape SliceShape(const TShape& ishape, + const EnvArguments& env) { + SliceParam param; + param.Init(env.kwargs); + CHECK(param.axis < static_cast(ishape.ndim())) << + "axis must be smaller than the source ndim! Received axis=" << + param.axis << ", src_ndim=" << ishape.ndim(); + int axis_size = static_cast(ishape[param.axis]); + CHECK_LE(param.end, axis_size); + CHECK_LT(param.begin, param.end); + + std::vector shape; + for (index_t i = 0; i < ishape.ndim(); ++i) { + if (static_cast(i) == param.axis) { + shape.push_back(static_cast(param.end - param.begin)); + } else { + shape.push_back(ishape[i]); + } + } + return TShape(shape.begin(), shape.end()); } + + +template +void Slice(const TBlob &src, + const EnvArguments& env, + TBlob *ret, + OpReqType req, + RunContext ctx) { + using namespace mshadow::expr; + SliceParam param; + param.Init(env.kwargs); + mshadow::Stream *s = ctx.get_stream(); + int ndim = static_cast(ret->shape_.ndim()); + + if (param.axis + 1 == ndim) { + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { + mshadow::Tensor in = + src.FlatTo2D(s); + mshadow::Tensor out = + ret->FlatTo2D(s); + ASSIGN_DISPATCH(out, req, slice<1>(in, param.begin, param.end)); + }); + } else { + MSHADOW_TYPE_SWITCH(ret->type_flag_, DType, { + mshadow::Tensor in = + src.FlatTo3D(param.axis, s); + mshadow::Tensor out = + ret->FlatTo3D(param.axis, s); + ASSIGN_DISPATCH(out, req, slice<1>(in, param.begin, param.end)); + }); + } +} + +// Backward pass of slice along the given axis +template +void SliceGrad_(const OutputGrad& out_grad, + const EnvArguments& env, + TBlob *in_grad, + OpReqType req, + RunContext ctx) { + using namespace mshadow::op; + using namespace mshadow::expr; + SliceParam param; + param.Init(env.kwargs); + mshadow::Stream *s = ctx.get_stream(); + int ndim = static_cast(in_grad->shape_.ndim()); + + if (param.axis + 1 == ndim) { + MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { + mshadow::Tensor ograd = + out_grad.data.FlatTo2D(s); + mshadow::Tensor igrad = + in_grad->FlatTo2D(s); + if (req == kAddTo) { + slice<1>(igrad, param.begin, param.end) += F(ograd); + } else if (req == kWriteTo) { + igrad = 0.0f; + slice<1>(igrad, param.begin,
param.end) = F(ograd); + } else { + CHECK_EQ(req, kNullOp); + } + }); + } else { + MSHADOW_TYPE_SWITCH(in_grad->type_flag_, DType, { + mshadow::Tensor ograd = + out_grad.data.FlatTo3D(param.axis, s); + mshadow::Tensor igrad = + in_grad->FlatTo3D(param.axis, s); + if (req == kAddTo) { + slice<1>(igrad, param.begin, param.end) += F(ograd); + } else if (req == kWriteTo) { + igrad = 0.0f; + slice<1>(igrad, param.begin, param.end) = F(ograd); + } else { + CHECK_EQ(req, kNullOp); + } + }); + } +} + struct FlipParam : public dmlc::Parameter { int axis; DMLC_DECLARE_PARAMETER(FlipParam) { @@ -387,22 +642,43 @@ MXNET_REGISTER_SIMPLE_OP(transpose, XPU) .set_function(XPU::kDevMask, Transpose, kNoInplace, kRegisterSymbolic) .set_shape_function(TransposeShape) .set_gradient(XPU::kDevMask, TransposeGrad, kNoInplace) -.describe("Transpose the input matrix and return a new one"); +.describe("Transpose the input matrix and return a new one") +.add_arguments(TransposeParam::__FIELDS__()); + +// expand_dim +MXNET_REGISTER_SIMPLE_OP(expand_dims, XPU) +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, ReshapeImpl, kInplaceInOut) +.set_shape_function(ExpandDimShape) +.set_gradient(XPU::kDevMask, ReshapeGrad_, kInplaceOutIn) +.describe("Expand the shape of array by inserting a new axis.") +.add_arguments(ExpandDimParam::__FIELDS__()); // crop MXNET_REGISTER_SIMPLE_OP(crop, XPU) .set_enable_kwargs(true) .set_function(XPU::kDevMask, Crop, kNoInplace, kNotRegisterSymbolic) .set_shape_function(CropShape) -.describe("Crop the input matrix and return a new one"); +.describe("Crop the input matrix and return a new one") +.add_arguments(SimpleCropParam::__FIELDS__()); + +// slice_axis +MXNET_REGISTER_SIMPLE_OP(slice_axis, XPU) +.set_enable_kwargs(true) +.set_function(XPU::kDevMask, Slice, + kNoInplace, kRegisterSymbolic) +.set_gradient(XPU::kDevMask, SliceGrad_, kNoInplace) +.set_shape_function(SliceShape) +.describe("Slice the input along certain axis and return a sliced array.") +.add_arguments(SliceParam::__FIELDS__()); // flip MXNET_REGISTER_SIMPLE_OP(flip, XPU) .set_enable_kwargs(true) .set_function(XPU::kDevMask, Flip, kNoInplace, kNotRegisterSymbolic) .set_shape_function(FlipShape) -.describe("Flip the input matrix along axis and return a new one"); - +.describe("Flip the input matrix along axis and return a new one") +.add_arguments(FlipParam::__FIELDS__()); // dot MXNET_REGISTER_SIMPLE_OP(dot, XPU) @@ -410,6 +686,15 @@ MXNET_REGISTER_SIMPLE_OP(dot, XPU) .set_shape_function(DotShape) .set_gradient(XPU::kDevMask, DotBackward_, kNoInplace) .describe("Calculate dot product of two matrices or two vectors"); + +// batched_dot +MXNET_REGISTER_SIMPLE_OP(batch_dot, XPU) +.set_function(XPU::kDevMask, BatchDotForward_, kNoInplace, kRegisterSymbolic) +.set_shape_function(BatchDotShape) +.set_gradient(XPU::kDevMask, BatchDotBackward_, kNoInplace) +.set_resource_request(ResourceRequest::kTempSpace) +.describe("Calculate batched dot product of two matrices." 
+ " (batch, M, K) batch_dot (batch, K, N) --> (batch, M, N)"); } // namespace op } // namespace mxnet diff --git a/src/operator/matrix_op.cc b/src/operator/matrix_op.cc index ff6d01546497..24b72429a169 100644 --- a/src/operator/matrix_op.cc +++ b/src/operator/matrix_op.cc @@ -9,7 +9,9 @@ namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(TransposeParam); +DMLC_REGISTER_PARAMETER(ExpandDimParam); DMLC_REGISTER_PARAMETER(SimpleCropParam); +DMLC_REGISTER_PARAMETER(SliceParam); DMLC_REGISTER_PARAMETER(FlipParam); } // op } // mxnet diff --git a/src/operator/operator_util.cc b/src/operator/operator_util.cc index 2b92b3150a97..70e88331d49a 100644 --- a/src/operator/operator_util.cc +++ b/src/operator/operator_util.cc @@ -15,6 +15,7 @@ namespace mxnet { namespace op { class SimpleOpPropBase; +class SimpleSourceOpProp; class SimpleUnaryOpProp; class SimpleBinaryOpProp; @@ -64,6 +65,12 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry { return *this; } + TSelf& set_shape_function(SourceShapeFunction fshapeinfer) override { + std::lock_guard lock(mutex_); + source_shape_ = fshapeinfer; + return *this; + } + TSelf& set_shape_function(UnaryShapeFunction fshapeinfer) override { std::lock_guard lock(mutex_); unary_shape_ = fshapeinfer; @@ -76,6 +83,21 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry { return *this; } + TSelf& set_function(int dev_mask, + SourceFunction fsource, + SimpleOpRegOption register_symbolic) override { + std::lock_guard lock(mutex_); + SetFunction(&fsource_, dev_mask, fsource, "SourceFunction"); + if (++reg_counter_ == 1) { + this->RegisterSourceImperative(); + register_symbolic_ = (register_symbolic == kRegisterSymbolic); + if (register_symbolic_) { + this->RegisterSourceSymbolic(); + } + } + return *this; + } + TSelf& set_function(int dev_mask, UnaryFunction funary, SimpleOpInplaceOption inplace_in_out, @@ -178,6 +200,7 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry { protected: // make friend with unary op friend class SimpleOpPropBase; + friend class SimpleSourceOpProp; friend class SimpleUnaryOpProp; friend class SimpleBinaryOpProp; // internal mutex @@ -196,6 +219,11 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry { bool enable_kwargs_{false}; // resource requirements std::vector resource_requests_; + // ------ source functions ---- + // source shape inference information. + SourceShapeFunction source_shape_{nullptr}; + // source functions on each device mask + std::vector fsource_; // ------ unary functions ----- // unary shape inference information. UnaryShapeFunction unary_shape_{nullptr}; @@ -266,6 +294,10 @@ class SimpleOpRegEntryImpl : public SimpleOpRegEntry { } return *op_reg_; } + // register source function. + void RegisterSourceImperative(); + // register source symbolic function. + void RegisterSourceSymbolic(); // register unary function. void RegisterUnaryImperative(); // register unary symbolic function. 
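// Editor's note: a minimal usage sketch (not part of this patch) for the
// source-op hooks declared above; the operator name `my_source` and both
// function names are hypothetical. The signatures follow SourceShapeFunction
// (EnvArguments -> TShape) and SourceFunction as they are invoked by
// RegisterSourceImperative in the next hunk:
//
//   inline TShape MySourceShape(const EnvArguments& env) {
//     // a source op has no inputs, so the output shape must come from
//     // env.kwargs (or env.scalar) alone
//     ...
//   }
//   template<typename xpu>
//   void MySource(const EnvArguments& env, TBlob* ret,
//                 OpReqType req, RunContext ctx) {
//     // fill *ret according to req; no input TBlobs are passed
//     ...
//   }
//   MXNET_REGISTER_SIMPLE_OP(my_source, XPU)
//   .set_enable_kwargs(true)
//   .set_shape_function(MySourceShape)
//   .set_function(XPU::kDevMask, MySource<XPU>, kRegisterSymbolic)
//   .describe("hypothetical zero-input (source) operator");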
@@ -295,6 +327,264 @@ SimpleOpRegistry::~SimpleOpRegistry() {
     delete kv.second;
   }
 }
+
+// base class
+struct SimpleOpScalarParam :
+    public dmlc::Parameter<SimpleOpScalarParam> {
+  float scalar;
+  DMLC_DECLARE_PARAMETER(SimpleOpScalarParam) {
+    DMLC_DECLARE_FIELD(scalar)
+    .describe("scalar value.");
+  }
+};
+
+DMLC_REGISTER_PARAMETER(SimpleOpScalarParam);
+
+class SimpleOpPropBase : public OperatorProperty {
+ public:
+  std::string name;
+  EnvArguments env;
+  SimpleOpRegEntryImpl* source;
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    if (source->enable_kwargs_) {
+      env.kwargs = kwargs;
+    } else if (source->enable_scalar_) {
+      SimpleOpScalarParam param;
+      param.Init(kwargs);
+      env.scalar = param.scalar;
+    } else {
+      CHECK_EQ(kwargs.size(), 0)
+          << "Operator " << source->symbol_name_ << " does not accept any keyword arguments";
+    }
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    if (source->enable_kwargs_) {
+      return std::map<std::string, std::string>(
+          env.kwargs.begin(), env.kwargs.end());
+    } else if (source->enable_scalar_) {
+      SimpleOpScalarParam param;
+      param.scalar = env.scalar;
+      return param.__DICT__();
+    } else {
+      return std::map<std::string, std::string>();
+    }
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return source->resource_requests_;
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return source->resource_requests_;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_LE(in_type->size(), this->ListArguments().size());
+    int dtype = -1;
+    // reduce dtype to a common one.
+    for (unsigned i = 0; i < in_type->size(); ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);
+      } else {
+        CHECK(in_type->at(i) == -1 ||
+              in_type->at(i) == dtype) <<
+            "Non-uniform input data type. Expected " << dtype << " got " << in_type->at(i);
+      }
+    }
+
+    if (dtype == -1) {
+      LOG(FATAL) << "At least one input type needs to be specified.";
+      return false;
+    }
+
+    int n_in = this->ListArguments().size();
+    in_type->clear();
+    for (int i = 0; i < n_in; ++i) in_type->push_back(dtype);
+
+    int n_out = this->ListOutputs().size();
+    out_type->clear();
+    for (int i = 0; i < n_out; ++i) out_type->push_back(dtype);
+
+    int n_aux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (int i = 0; i < n_aux; ++i) aux_type->push_back(dtype);
+    return true;
+  }
+
+  std::string TypeString() const override {
+    return name;
+  }
+};
+
+//-------------------------------------
+// source function Implementation
+//-------------------------------------
+void SimpleOpRegEntryImpl::RegisterSourceImperative() {
+  CHECK_EQ(reg_counter_, 1);
+  // The body to be registered
+  auto body = [this] (NDArray** used_vars,
+                      real_t* s,
+                      NDArray** mutate_vars,
+                      int num_params,
+                      char** param_keys,
+                      char** param_vals) {
+    NDArray* out = mutate_vars[0];
+    // setup env.
+    EnvArguments env;
+    if (enable_scalar_) env.scalar = s[0];
+    if (enable_kwargs_) {
+      for (int i = 0; i < num_params; ++i) {
+        env.kwargs.emplace_back(std::make_pair(
+            std::string(param_keys[i]), std::string(param_vals[i])));
+      }
+    } else {
+      CHECK_EQ(num_params, 0)
+          << "operator " << this->name << " does not take keyword arguments";
+    }
+    // shape inference.
+    CHECK(source_shape_ != nullptr);
+    TShape dshape = source_shape_(env);
+    // check output shape.
+    CHECK(!out->is_none());
+    CHECK(out->shape() == dshape) << "target shape mismatch "
+        << out->shape() << " vs. " << dshape;
+
+    // important: callback must always capture by value
+    NDArray ret = *out;
+    // request resources.
+    std::vector<Engine::VarHandle> write_vars = {ret.var()};
+    for (ResourceRequest req : resource_requests_) {
+      env.resource.push_back(ResourceManager::Get()->Request(ret.ctx(), req));
+      write_vars.push_back(env.resource.back().var);
+    }
+    // check if the function exists
+    int dev_mask = ret.ctx().dev_mask();
+    // error message
+    if (static_cast<size_t>(dev_mask) >= fsource_.size() ||
+        fsource_[dev_mask] == nullptr) {
+      if (dev_mask == gpu::kDevMask) {
+        LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+      }
+      LOG(FATAL) << "Function " << this->name
+                 << " not registered for device " << dev_mask;
+    }
+    // invoke the function
+    SourceFunction fun = fsource_[dev_mask];
+    OpReqType req = kWriteTo;
+
+    Engine::Get()->PushSync([ret, fun, dev_mask, req, env](RunContext ctx) {
+        ret.CheckAndAlloc();
+        TBlob tmp = ret.data();
+        (*fun)(env, &tmp, req, ctx);
+#if MXNET_USE_CUDA
+        if (dev_mask == gpu::kDevMask) {
+          ctx.get_stream<gpu>()->Wait();
+        }
+#endif
+      }, ret.ctx(), {}, write_vars);
+  };
+  // register the function.
+  NDArrayReg()
+      .set_body(body)
+      .set_num_use_vars(0)
+      .set_num_mutate_vars(1);
+  if (enable_scalar_) {
+    NDArrayReg()
+        .set_num_scalars(1)
+        .add_argument("scalar", "float", "scalar input to the function");
+  }
+}
+
+// operator to invoke source function.
+struct SimpleSourceOperator : public Operator {
+  EnvArguments env;
+  SourceFunction forward;
+
+  void Forward(const OpContext &ctx,
+               const std::vector<TBlob> &in_data,
+               const std::vector<OpReqType> &req,
+               const std::vector<TBlob> &out_data,
+               const std::vector<TBlob> &aux_args) override {
+    if (ctx.requested.size() != 0) env.resource = ctx.requested;
+    CHECK_EQ(in_data.size(), 0);
+    CHECK_EQ(out_data.size(), 1);
+    TBlob out = out_data[0];
+    (*forward)(env, &out, req[0], ctx.run_ctx);
+  }
+
+  void Backward(const OpContext &ctx,
+                const std::vector<TBlob> &out_grad,
+                const std::vector<TBlob> &in_data,
+                const std::vector<TBlob> &out_data,
+                const std::vector<OpReqType> &req,
+                const std::vector<TBlob> &in_grad,
+                const std::vector<TBlob> &aux_args) override {
+    LOG(FATAL) << "no gradient can be computed for a source operator";
+    // do nothing.
+  }
+};  // class SimpleSourceOperator
+
+class SimpleSourceOpProp : public SimpleOpPropBase {
+ public:
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    CHECK_EQ(in_shape->size(), 0)
+        << in_shape->size();
+    CHECK(source->source_shape_ != nullptr);
+    out_shape->clear();
+    out_shape->push_back((*(source->source_shape_))(env));
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SimpleSourceOpProp();
+    ptr->source = source;
+    ptr->name = name;
+    ptr->env = env;
+    return ptr;
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    size_t dev_mask = ctx.dev_mask();
+    SimpleSourceOperator *op = new SimpleSourceOperator();
+    CHECK(dev_mask < source->fsource_.size() && source->fsource_[dev_mask] != nullptr);
+    op->forward = source->fsource_[dev_mask];
+    op->env = this->env;
+    return op;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {};
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    out_type->clear();
+    out_type->push_back(mshadow::kFloat32);
+    return true;
+  }
+};
+
+void SimpleOpRegEntryImpl::RegisterSourceSymbolic() {
+  // register the operator
+  auto op_factory = [this]() {
+    SimpleSourceOpProp *prop = new SimpleSourceOpProp();
+    prop->name = this->symbol_name_;
+    prop->source = this;
+    return prop;
+  };
+  OpReg()
+      .set_body(op_factory);
+}
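// A minimal end-to-end use of the source hooks above, modeled on how
// sample_op-inl.h later in this patch registers its sampling ops. The
// "_ones" op and the OnesParam/Ones_/OnesShape names are hypothetical,
// for illustration only; a source op is a graph leaf, so it never sees
// a Backward call.
struct OnesParam : public dmlc::Parameter<OnesParam> {
  TShape shape;
  DMLC_DECLARE_PARAMETER(OnesParam) {
    DMLC_DECLARE_FIELD(shape).describe("The shape of the output");
  }
};
DMLC_REGISTER_PARAMETER(OnesParam);  // in the corresponding .cc file

template<typename xpu>
void Ones_(const EnvArguments& env, TBlob *ret, OpReqType req, RunContext ctx) {
  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
  mshadow::Tensor<xpu, 2, real_t> out = ret->FlatTo2D<xpu, real_t>(s);
  out = 1.0f;  // imperative source calls arrive with req == kWriteTo
}

template<typename ParamType>
inline TShape OnesShape(const EnvArguments& env) {
  ParamType param;
  param.Init(env.kwargs);  // the shape argument arrives through kwargs
  return param.shape;
}

MXNET_REGISTER_SIMPLE_OP(_ones, XPU)  // XPU as defined in sample_op-inl.h
.set_enable_kwargs(true)
.set_function(XPU::kDevMask, Ones_<XPU>)
.set_shape_function(OnesShape<OnesParam>)
.describe("Fill the output with ones (illustration only).");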
+
+//-------------------------------------
+// unary function Implementation
+//-------------------------------------
@@ -457,99 +747,6 @@ struct SimpleUnaryOperator : public Operator {
   }
 };  // class SimpleUnaryOperator

-struct SimpleOpScalarParam :
-    public dmlc::Parameter<SimpleOpScalarParam> {
-  float scalar;
-  DMLC_DECLARE_PARAMETER(SimpleOpScalarParam) {
-    DMLC_DECLARE_FIELD(scalar)
-    .describe("scalar value.");
-  }
-};
-
-DMLC_REGISTER_PARAMETER(SimpleOpScalarParam);
-
-class SimpleOpPropBase : public OperatorProperty {
- public:
-  std::string name;
-  EnvArguments env;
-  SimpleOpRegEntryImpl* source;
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    if (source->enable_kwargs_) {
-      env.kwargs = kwargs;
-    } else if (source->enable_scalar_) {
-      SimpleOpScalarParam param;
-      param.Init(kwargs);
-      env.scalar = param.scalar;
-    } else {
-      CHECK_EQ(kwargs.size(), 0)
-          << "Operator " << source->symbol_name_ << " donot accept any keyword arguments";
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    if (source->enable_kwargs_) {
-      return std::map<std::string, std::string>(
-          env.kwargs.begin(), env.kwargs.end());
-    } else if (source->enable_scalar_) {
-      SimpleOpScalarParam param;
-      param.scalar = env.scalar;
-      return param.__DICT__();
-    } else {
-      return std::map<std::string, std::string>();
-    }
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return source->resource_requests_;
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const std::vector<TShape> &in_shape) const override {
-    return source->resource_requests_;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_LE(in_type->size(), this->ListArguments().size());
-    int dtype = -1;
-    // reduce dtype to a common one.
-    for (unsigned i = 0; i < in_type->size(); ++i) {
-      if (dtype == -1) {
-        dtype = in_type->at(i);
-      } else {
-        CHECK(in_type->at(i) == -1 ||
-              in_type->at(i) == dtype) <<
-            "Non-uniform input data type. Expected " << dtype << "got " << in_type->at(i);
-      }
-    }
-
-    if (dtype == -1) {
-      LOG(FATAL) << "At least one input type needs to be specified.";
-      return false;
-    }
-
-    int n_in = this->ListArguments().size();
-    in_type->clear();
-    for (int i = 0; i < n_in; ++i) in_type->push_back(dtype);
-
-    int n_out = this->ListOutputs().size();
-    out_type->clear();
-    for (int i = 0; i < n_out; ++i) out_type->push_back(dtype);
-
-    int n_aux = this->ListAuxiliaryStates().size();
-    aux_type->clear();
-    for (int i = 0; i < n_aux; ++i) aux_type->push_back(dtype);
-    return true;
-  }
-
-  std::string TypeString() const override {
-    return name;
-  }
-};
-
 class SimpleUnaryOpProp : public SimpleOpPropBase {
  public:
   bool InferShape(std::vector<TShape> *in_shape,
@@ -644,11 +841,9 @@ void SimpleOpRegEntryImpl::RegisterUnarySymbolic() {
   };
   OpReg()
       .set_body(op_factory)
-      .add_argument("lhs", "Symbol", "Left symbolic input to the function")
-      .add_argument("rhs", "Symbol", "Left symbolic input to the function");
+      .add_argument("src", "Symbol", "Source symbolic input to the function");
 }
-
 //-------------------------------------
 // binary function Implementation
 //-------------------------------------
@@ -933,7 +1128,7 @@ void SimpleOpRegEntryImpl::RegisterBinarySymbolic() {
   OpReg()
       .set_body(op_factory)
       .add_argument("lhs", "Symbol", "Left symbolic input to the function")
-      .add_argument("rhs", "Symbol", "Left symbolic input to the function");
+      .add_argument("rhs", "Symbol", "Right symbolic input to the function");
 }

 }  // namespace op
diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h
index ac6190d38c93..f1fda56713e9 100644
--- a/src/operator/pooling-inl.h
+++ b/src/operator/pooling-inl.h
@@ -39,8 +39,8 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
               "This is useful for input with different shape");

     DMLC_DECLARE_FIELD(kernel)
-    .set_expect_ndim(2).enforce_nonzero()
-    .describe("pooling kernel size: (y, x)");
+    .enforce_nonzero()
+    .describe("pooling kernel size: (y, x) or (d, y, x)");

     DMLC_DECLARE_FIELD(pool_type)
     .add_enum("max", pool_enum::kMaxPooling)
@@ -50,13 +50,12 @@ struct PoolingParam : public dmlc::Parameter<PoolingParam> {
     int stride_shape[] = {1, 1};
     DMLC_DECLARE_FIELD(stride).set_default(TShape(stride_shape, stride_shape + 2))
-    .set_expect_ndim(2).enforce_nonzero()
-    .describe("stride: for pooling (y, x)");
+    .enforce_nonzero()
+    .describe("stride: for pooling (y, x) or (d, y, x)");

     int pad_shape[] = {0, 0};
     DMLC_DECLARE_FIELD(pad).set_default(TShape(pad_shape, pad_shape + 2))
-    .set_expect_ndim(2)
-    .describe("pad for pooling: (y, x)");
+    .describe("pad for pooling: (y, x) or (d, y, x)");
   }
 };

@@ -77,6 +76,9 @@ class PoolingOp : public Operator {
     CHECK_EQ(in_data.size(), 1);
     CHECK_EQ(out_data.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "Not implemented";
+    }
     Tensor<xpu, 4> data = in_data[pool_enum::kData].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> out = out_data[pool_enum::kOut].get<xpu, 4, real_t>(s);
     mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
@@ -119,6 +121,9 @@ class PoolingOp : public Operator {
     CHECK_EQ(req.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     // TODO(bing): remove pad (0,0)
+    if (param_.kernel.ndim() == 3) {
+      LOG(FATAL) << "Not implemented";
+    }
     Stream<xpu> *s = ctx.get_stream<xpu>();
     Tensor<xpu, 4> grad = out_grad[pool_enum::kOut].get<xpu, 4, real_t>(s);
     Tensor<xpu, 4> data = in_data[pool_enum::kData].get<xpu, 4, real_t>(s);
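The rewritten InferShape in the next hunk replaces the old min()-based expression with the convolution-style rule out = 1 + (in + 2*pad - kernel) / stride, applied per spatial axis, plus an explicit kernel-fits-input check. A quick standalone check of that arithmetic (sizes are arbitrary):

#include <cstdio>
int main() {
  int in = 13, pad = 1, kernel = 3, stride = 2;    // arbitrary example sizes
  int out = 1 + (in + 2 * pad - kernel) / stride;  // 1 + 12 / 2
  std::printf("%d\n", out);                        // prints 7
  return 0;
}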
@@ -179,22 +184,43 @@ class PoolingProp : public OperatorProperty {
                   std::vector<TShape> *aux_shape) const override {
     CHECK_EQ(in_shape->size(), 1);
     const TShape &dshape = (*in_shape)[0];
-    CHECK_EQ(dshape.ndim(), 4) << \
-      "Pooling: Input data should be 4D in (batch, channel, y, x)";
+    CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
+                               << "or 5D in (batch, channel, d, y, x)";
     TShape oshape = dshape;
     if (dshape.ndim() ==  0) return false;
-    if (param_.global_pool) {
-      oshape[2] = 1;
-      oshape[3] = 1;
-    } else {
-      oshape[2] = std::min(dshape[2] + 2 * param_.pad[0] - param_.kernel[0] + param_.stride[0] - 1,
-                           dshape[2] + 2 * param_.pad[0] - 1) / param_.stride[0] + 1;
-      oshape[3] = std::min(dshape[3] + 2 * param_.pad[1] - param_.kernel[1] + param_.stride[1] - 1,
-                           dshape[3] + 2 * param_.pad[1] - 1) / param_.stride[1] + 1;
+    if (param_.kernel.ndim() == 2) {
+      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
+
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+      } else {
+        CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]
+              && param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
+            << "kernel size exceeds input";
+        oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
+        oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);
+    } else if (param_.kernel.ndim() == 3) {
+      CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
+      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]
+            && param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]
+            && param_.kernel[2] <= dshape[4] + 2 * param_.pad[2])
+          << "kernel size exceeds input";
+      if (param_.global_pool) {
+        oshape[2] = 1;
+        oshape[3] = 1;
+        oshape[4] = 1;
+      } else {
+        oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / param_.stride[0];
+        oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / param_.stride[1];
+        oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / param_.stride[2];
+      }
+      out_shape->clear();
+      out_shape->push_back(oshape);
     }
-    CHECK(oshape[2] > 0 && oshape[3] > 0) << "Pooling: kernel size exceed input";
-    out_shape->clear();
-    out_shape->push_back(oshape);
     return true;
   }
diff --git a/src/operator/reshape-inl.h b/src/operator/reshape-inl.h
index 4ddc310e46a0..b61224f7ec3b 100644
--- a/src/operator/reshape-inl.h
+++ b/src/operator/reshape-inl.h
@@ -128,7 +128,7 @@ struct ReshapeParam : public dmlc::Parameter<ReshapeParam> {
   }
 };

-template<typename xpu>
+template<typename xpu, typename DType>
 class ReshapeOp : public Operator {
  public:
   explicit ReshapeOp(ReshapeParam param) {}  // Do nothing
@@ -145,8 +145,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_data.size(), 1);
     if (req[reshape_enum::kOut] == kNullOp) return;
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> data = in_data[reshape_enum::kData].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> out = out_data[reshape_enum::kOut].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> data = in_data[reshape_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[reshape_enum::kOut].FlatTo2D<xpu, DType>(s);
     CHECK_EQ(data.CheckContiguous(), true);
     CHECK_EQ(out.CheckContiguous(), true);
     if (data.dptr_ == out.dptr_) return;
@@ -168,8 +168,8 @@ class ReshapeOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), 1);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2> grad_in = in_grad[reshape_enum::kOut].FlatTo2D<xpu, real_t>(s);
-    Tensor<xpu, 2> grad_out = out_grad[reshape_enum::kData].FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2, DType> grad_in = in_grad[reshape_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad_out = out_grad[reshape_enum::kData].FlatTo2D<xpu, DType>(s);
     CHECK_EQ(grad_out.CheckContiguous(), true);
     CHECK_EQ(grad_in.CheckContiguous(), true);
     if (grad_out.dptr_ == grad_in.dptr_) return;
@@
-179,7 +179,7 @@ class ReshapeOp : public Operator { }; // class ReshapeOp template -Operator* CreateOp(ReshapeParam); +Operator* CreateOp(ReshapeParam, int dtype); #if DMLC_USE_CXX11 class ReshapeProp : public OperatorProperty { @@ -275,6 +275,18 @@ class ReshapeProp : public OperatorProperty { return true; } + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + + out_type->clear(); + out_type->push_back(dtype); + return true; + } + OperatorProperty* Copy() const override { auto ptr = new ReshapeProp(); ptr->param_ = param_; @@ -306,7 +318,13 @@ class ReshapeProp : public OperatorProperty { return {{out_grad[reshape_enum::kOut], in_grad[reshape_enum::kData]}}; } - Operator* CreateOperator(Context ctx) const override; + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; protected: ReshapeParam param_; diff --git a/src/operator/reshape.cc b/src/operator/reshape.cc index beee35bb0cf9..ff5579b9b286 100644 --- a/src/operator/reshape.cc +++ b/src/operator/reshape.cc @@ -11,12 +11,21 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(ReshapeParam param) { - return new ReshapeOp(param); +Operator *CreateOp(ReshapeParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ReshapeOp(param); + }); + return op; } -Operator* ReshapeProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); +Operator* ReshapeProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + CHECK(InferType(in_type, &out_type, &aux_type)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); } DMLC_REGISTER_PARAMETER(ReshapeParam); diff --git a/src/operator/reshape.cu b/src/operator/reshape.cu index 06bbaec1fdfd..ab911749e684 100644 --- a/src/operator/reshape.cu +++ b/src/operator/reshape.cu @@ -11,8 +11,12 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(ReshapeParam param) { - return new ReshapeOp(param); +Operator *CreateOp(ReshapeParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ReshapeOp(param); + }); + return op; } } // namespace op diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h new file mode 100644 index 000000000000..a70138adb7ce --- /dev/null +++ b/src/operator/rnn-inl.h @@ -0,0 +1,315 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file rnn-inl.h + * \brief + * \author Sebastian Bodenstein +*/ +#ifndef MXNET_OPERATOR_RNN_INL_H_ +#define MXNET_OPERATOR_RNN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace rnn_enum { + enum RNNOpInputs {kData, kParams, kState, kStateCell}; + enum RNNOpOutputs {kOut, kStateOut, kStateCellOut}; + enum RNNModeType {kRnnRelu, kRnnTanh, kLstm, kGru}; + enum RNNOpResource {kTempSpace}; +} + +// A utility function to calculate input size +inline int rnn_single_param_size(int inputSize, + int hiddenSize, + int mode) { + int size = hiddenSize * (hiddenSize + inputSize + 2); + // Different RNN's have different num weights + switch (mode) { + case rnn_enum::kRnnRelu: + size *= 1; + break; + case rnn_enum::kRnnTanh: + size *= 1; + break; + case rnn_enum::kLstm: + size *= 4; + break; + case rnn_enum::kGru: + size *= 3; + break; + } + return size; +} + +inline int rnn_param_size(int layerNum, + int inputSize, + int hiddenSize, + bool bidirectional, + int mode) { + // get size of first layer + int size = rnn_single_param_size(inputSize, hiddenSize, mode); + // get size of remaining layers + if (bidirectional) { + size += (layerNum - 1) * rnn_single_param_size(2 * hiddenSize, hiddenSize, mode); + size *= 2; + } else { + size += (layerNum - 1) * rnn_single_param_size(hiddenSize, hiddenSize, mode); + } + return size; +} + +struct RNNParam : public dmlc::Parameter { + uint32_t state_size; + uint32_t num_layers; + bool bidirectional, state_outputs; + int mode; + float p, pkeep_; + int seq_length_, batch_size_, input_size_; + bool lstm_q_; // whether type is lstm + + DMLC_DECLARE_PARAMETER(RNNParam) { + DMLC_DECLARE_FIELD(state_size) + .describe("size of the state for each layer"); + + DMLC_DECLARE_FIELD(num_layers) + .describe("number of stacked layers"); + + DMLC_DECLARE_FIELD(bidirectional).set_default(false) + .describe("whether to use bidirectional recurrent layers"); + + DMLC_DECLARE_FIELD(mode) + .add_enum("rnn_relu", rnn_enum::kRnnRelu) + .add_enum("rnn_tanh", rnn_enum::kRnnTanh) + .add_enum("lstm", rnn_enum::kLstm) + .add_enum("gru", rnn_enum::kGru) + .describe("the type of RNN to compute"); + + DMLC_DECLARE_FIELD(p).set_default(0.) 
+ .set_range(0, 1) + .describe("Fraction of the input that gets dropped out at training time"); + + DMLC_DECLARE_FIELD(state_outputs).set_default(false) + .describe("Whether to have the states as symbol outputs."); + } +}; + +template +class RNNOp : public Operator { + public: + explicit RNNOp(RNNParam p) { + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(sbodenstein): add MShadow implementation + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(sbodenstein): add MShadow implementation + } + + private: + RNNParam param_; +}; // class RNNOp + +template +Operator* CreateOp(RNNParam param, int dtype); + +#if DMLC_USE_CXX11 +class RNNProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (param_.mode == rnn_enum::kLstm) { + return {"data", "parameters", "state", "state_cell"}; + } else { + return {"data", "parameters", "state"}; + } + } + + std::vector ListOutputs() const override { + std::vector outputs = {"output"}; + if (!param_.state_outputs) + return outputs; + else + outputs.push_back("state"); + if (param_.mode == rnn_enum::kLstm) + outputs.push_back("state_cell"); + return outputs; + } + + int NumOutputs() const override { + int mode_num = (param_.mode == rnn_enum::kLstm) ? 2 : 1; + int num_outputs = param_.state_outputs ? (mode_num + 1) : 1; + return num_outputs; + } + + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (param_.mode == rnn_enum::kLstm) { + CHECK_EQ(in_shape->size(), 4) << "Input:[data, parameters, state, cell_state]"; + } else { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, parameters, state]"; + } + const TShape &dshape = (*in_shape)[rnn_enum::kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 3) \ + << "Input data should be rank-3 tensor of dim [sequence length, batch size, input size]"; + // data: [sequence len, batch, input dimension] + int batch_size = dshape[1]; + int input_size = dshape[2]; + int numDirections = param_.bidirectional ? 
2 : 1; + int total_layers = numDirections * param_.num_layers; // double for bidirectional + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kState, + Shape3(total_layers, batch_size, param_.state_size)); + if (param_.mode == rnn_enum::kLstm) + SHAPE_ASSIGN_CHECK(*in_shape, + rnn_enum::kStateCell, + Shape3(total_layers, batch_size, param_.state_size)); + + // calculate parameter vector length + int param_size = rnn_param_size(param_.num_layers, + input_size, + param_.state_size, + param_.bidirectional, + param_.mode); + SHAPE_ASSIGN_CHECK(*in_shape, rnn_enum::kParams, Shape1(param_size)); + + out_shape->clear(); + // output: [sequence len, batch, output size] + TShape oshape = dshape; + oshape[2] = numDirections * param_.state_size; + out_shape->push_back(oshape); + if (!param_.state_outputs) { + return true; + } else { + // outStateShape: [layer_num, batch, state size] + TShape outStateShape = dshape; + outStateShape[0] = total_layers; + outStateShape[1] = batch_size; + outStateShape[2] = param_.state_size; + out_shape->push_back(outStateShape); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) + out_shape->push_back(outStateShape); + return true; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. " + << "Expected " << dtype << " v.s. given " + << (*in_type)[i] << " at " << ListArguments()[i]; + } + } + out_type->clear(); + out_type->push_back(dtype); + if (!param_.state_outputs) { + return true; + } else { + out_type->push_back(dtype); + // Deal with lstm cell state + if (param_.mode == rnn_enum::kLstm) + out_type->push_back(dtype); + return true; + } + } + + OperatorProperty* Copy() const override { + auto ptr = new RNNProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "RNN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + std::vector dep = {in_data[rnn_enum::kData], in_data[rnn_enum::kParams], + in_data[rnn_enum::kState], out_data[rnn_enum::kOut], out_grad[rnn_enum::kOut]}; + + if (param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateOut]); + dep.push_back(out_grad[rnn_enum::kStateOut]); + } + + if (param_.mode == rnn_enum::kLstm) { + dep.push_back(in_data[rnn_enum::kStateCell]); + if (param_.state_outputs) { + dep.push_back(out_data[rnn_enum::kStateCellOut]); + dep.push_back(out_grad[rnn_enum::kStateCellOut]); + } + } + return dep; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + RNNParam param_; +}; // class RNNProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_RNN_INL_H_ diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc new file mode 100644 
index 000000000000..3067c8e986c1
--- /dev/null
+++ b/src/operator/rnn.cc
@@ -0,0 +1,42 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file rnn.cc
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+
+#include "./rnn-inl.h"
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(RNNParam param, int dtype) {
+  LOG(FATAL) << "RNN is only available for gpu at the moment.";
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new RNNOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+Operator *RNNProp::CreateOperatorEx(Context ctx,
+                                    std::vector<TShape> *in_shape,
+                                    std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(RNNParam);
+
+MXNET_REGISTER_OP_PROPERTY(RNN, RNNProp)
+.describe("Apply a recurrent layer to input.")
+.add_argument("data", "Symbol", "Input data to RNN")
+.add_argument("parameters", "Symbol", "Vector of all RNN trainable parameters")
+.add_argument("state", "Symbol", "initial hidden state of the RNN")
+.add_argument("state_cell", "Symbol", "initial cell state for LSTM networks (only for LSTM)")
+.add_arguments(RNNParam::__FIELDS__());
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/rnn.cu b/src/operator/rnn.cu
new file mode 100644
index 000000000000..bf914026019d
--- /dev/null
+++ b/src/operator/rnn.cu
@@ -0,0 +1,30 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file rnn.cu
+ * \brief
+ * \author Sebastian Bodenstein
+*/
+
+#include "./rnn-inl.h"
+#include <vector>
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+#include "./cudnn_rnn-inl.h"
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<gpu>(RNNParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNRNNOp<DType>(param);
+  })
+#else
+  LOG(FATAL) << "RNN is only available for cuDNN at the moment.";
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
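As a sanity check of rnn_param_size from rnn-inl.h above: for a hypothetical unidirectional two-layer LSTM with input size 100 and hidden size 50, the gate multiplier is 4, so the first layer holds 4*50*(50+100+2) = 30400 weights and biases and each deeper layer 4*50*(50+50+2) = 20400, giving 50800 in total. A standalone sketch of that arithmetic:

#include <cstdio>
// Mirrors rnn_single_param_size()/rnn_param_size() above for the LSTM case.
static int single_layer(int input, int hidden) {
  return 4 * hidden * (hidden + input + 2);
}
int main() {
  int input = 100, hidden = 50, layers = 2;                 // hypothetical sizes
  int total = single_layer(input, hidden)                   // 30400
            + (layers - 1) * single_layer(hidden, hidden);  // + 20400
  std::printf("%d\n", total);                               // prints 50800
  return 0;
}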
diff --git a/src/operator/sample_op-inl.h b/src/operator/sample_op-inl.h
new file mode 100644
index 000000000000..41e3c40634ab
--- /dev/null
+++ b/src/operator/sample_op-inl.h
@@ -0,0 +1,112 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file sample_op-inl.h
+ * \brief Function definitions of sampling operators.
+ */
+#ifndef MXNET_OPERATOR_SAMPLE_OP_INL_H_
+#define MXNET_OPERATOR_SAMPLE_OP_INL_H_
+
+#include <mxnet/operator_util.h>
+#include "./mshadow_op.h"
+
+#if defined(__CUDACC__)
+#define XPU gpu
+#else
+#define XPU cpu
+#endif
+
+namespace mxnet {
+namespace op {
+
+struct SampleUniformParam : public dmlc::Parameter<SampleUniformParam> {
+  float low;
+  float high;
+  TShape shape;
+  DMLC_DECLARE_PARAMETER(SampleUniformParam) {
+    DMLC_DECLARE_FIELD(low).set_default(0.0f)
+    .describe("The lower bound of distribution");
+    DMLC_DECLARE_FIELD(high).set_default(1.0f)
+    .describe("The upper bound of distribution");
+    DMLC_DECLARE_FIELD(shape)
+    .describe("The shape of the output");
+  }
+};
+
+struct SampleNormalParam : public dmlc::Parameter<SampleNormalParam> {
+  float loc;
+  float scale;
+  TShape shape;
+  DMLC_DECLARE_PARAMETER(SampleNormalParam) {
+    DMLC_DECLARE_FIELD(loc).set_default(0.0f)
+    .describe("Mean of the distribution.");
+    DMLC_DECLARE_FIELD(scale).set_default(1.0f)
+    .describe("Standard deviation of the distribution.");
+    DMLC_DECLARE_FIELD(shape)
+    .describe("The shape of the output");
+  }
+};
+
+template<typename xpu>
+void SampleUniform_(const EnvArguments& env,
+                    TBlob *ret,
+                    OpReqType req,
+                    RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, mshadow::kFloat32)
+      << "only support float32 rnd so far";
+  SampleUniformParam param;
+  param.Init(env.kwargs);
+  mshadow::Random<xpu, real_t> *prnd = env.resource[0].get_random<xpu, real_t>(s);
+  mshadow::Tensor<xpu, 2, real_t> tmp = ret->FlatTo2D<xpu, real_t>(s);
+  prnd->SampleUniform(&tmp, float(param.low), float(param.high));  // NOLINT(*)
+}
+
+template<typename xpu>
+void SampleNormal_(const EnvArguments& env,
+                   TBlob *ret,
+                   OpReqType req,
+                   RunContext ctx) {
+  using namespace mxnet::op;
+  using namespace mshadow::expr;
+  mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
+  CHECK_EQ(ret->type_flag_, mshadow::kFloat32)
+      << "only support float32 rnd so far";
+  SampleNormalParam param;
+  param.Init(env.kwargs);
+  mshadow::Random<xpu, real_t> *prnd = env.resource[0].get_random<xpu, real_t>(s);
+  mshadow::Tensor<xpu, 2, real_t> tmp = ret->FlatTo2D<xpu, real_t>(s);
+  prnd->SampleGaussian(&tmp, float(param.loc), float(param.scale));  // NOLINT(*)
+}
+
+template<typename ParamType>
+inline TShape SampleShape(const EnvArguments& env) {
+  ParamType param;
+  param.Init(env.kwargs);
+  return param.shape;
+}
+
+// sample uniform
+MXNET_REGISTER_SIMPLE_OP(_sample_uniform, XPU)
+.set_symbol_op_name("uniform")
+.set_enable_kwargs(true)
+.set_resource_request(ResourceRequest::kRandom)
+.set_function(XPU::kDevMask, SampleUniform_<XPU>)
+.set_shape_function(SampleShape<SampleUniformParam>)
+.describe("Sample a uniform distribution")
+.add_arguments(SampleUniformParam::__FIELDS__());
+
+// sample normal
+MXNET_REGISTER_SIMPLE_OP(_sample_normal, XPU)
+.set_symbol_op_name("normal")
+.set_enable_kwargs(true)
+.set_resource_request(ResourceRequest::kRandom)
+.set_function(XPU::kDevMask, SampleNormal_<XPU>)
+.set_shape_function(SampleShape<SampleNormalParam>)
+.describe("Sample a normal distribution")
+.add_arguments(SampleNormalParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SAMPLE_OP_INL_H_
diff --git a/src/operator/sample_op.cc b/src/operator/sample_op.cc
new file mode 100644
index 000000000000..7672563648d8
--- /dev/null
+++ b/src/operator/sample_op.cc
@@ -0,0 +1,16 @@
+/*!
+ * Copyright (c) 2016 by Contributors + * \file sample_op.cc + * \brief CPU Implementation of sample op + */ +// this will be invoked by cc +#include "./sample_op-inl.h" + +namespace mxnet { +namespace op { + +DMLC_REGISTER_PARAMETER(SampleUniformParam); +DMLC_REGISTER_PARAMETER(SampleNormalParam); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/sample_op.cu b/src/operator/sample_op.cu new file mode 100644 index 000000000000..acc8541b4e4f --- /dev/null +++ b/src/operator/sample_op.cu @@ -0,0 +1,7 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file sample_op.cu + * \brief GPU Implementation of sample op + */ +// this will be invoked by nvcc and compile GPU version +#include "./sample_op-inl.h" diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h index 8bd587bd4270..55532117d835 100644 --- a/src/operator/softmax_output-inl.h +++ b/src/operator/softmax_output-inl.h @@ -85,8 +85,13 @@ class SoftmaxOutputOp : public Operator { out_data[softmaxout_enum::kOut].get_with_shape(s3, s); Softmax(out, data); } else { - Tensor data = in_data[softmaxout_enum::kData].FlatTo2D(s); - Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); + int n = in_data[softmaxout_enum::kData].size(0); + int k = in_data[softmaxout_enum::kData].Size()/n; + Shape<2> s2 = Shape2(n, k); + Tensor data = + in_data[softmaxout_enum::kData].get_with_shape(s2, s); + Tensor out = + out_data[softmaxout_enum::kOut].get_with_shape(s2, s); Softmax(out, data); } } @@ -106,11 +111,20 @@ class SoftmaxOutputOp : public Operator { CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); - if (param_.multi_output) { + if (out_data[softmaxout_enum::kOut].shape_ == + in_data[softmaxout_enum::kLabel].shape_) { + // use probability as label + Tensor label = in_data[softmaxout_enum::kLabel].FlatTo2D(s); + Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[softmaxout_enum::kData].FlatTo2D(s); + grad = (out - label) * scalar(param_.grad_scale); + } else if (param_.multi_output) { int n = out_data[softmaxout_enum::kOut].size(0); int k = out_data[softmaxout_enum::kOut].size(1); Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmaxout_enum::kOut].Size()/n/k)); - Tensor label = in_data[softmaxout_enum::kLabel].FlatTo2D(s); + Shape<2> s2 = Shape2(s3[0], s3[2]); + Tensor label = + in_data[softmaxout_enum::kLabel].get_with_shape(s2, s); Tensor out = out_data[softmaxout_enum::kOut].get_with_shape(s3, s); Tensor grad = @@ -150,11 +164,14 @@ class SoftmaxOutputOp : public Operator { grad *= o_grad; } } else { - const TShape& label_shape = in_data[softmaxout_enum::kLabel].shape_; + int n = out_data[softmaxout_enum::kOut].size(0); + Shape<2> s2 = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n); Tensor label = in_data[softmaxout_enum::kLabel].get_with_shape( - Shape1(label_shape.ProdShape(0, label_shape.ndim())), s); - Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); - Tensor grad = in_grad[softmaxout_enum::kData].FlatTo2D(s); + Shape1(in_data[softmaxout_enum::kLabel].Size()), s); + Tensor out = + out_data[softmaxout_enum::kOut].get_with_shape(s2, s); + Tensor grad = + in_grad[softmaxout_enum::kData].get_with_shape(s2, s); index_t valid_cnt = label.shape_.Size(); if (param_.use_ignore) { SoftmaxGrad(grad, out, label, static_cast(param_.ignore_label)); @@ -212,14 +229,34 @@ class SoftmaxOutputProp : public OperatorProperty { CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]"; const TShape &dshape = in_shape->at(0); if (dshape.ndim() == 0) return false; - if 
(param_.multi_output) {
-      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel,
-                         Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]));
-    } else {
-      TShape label_shape(dshape.ndim() - 1);
-      for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
-        label_shape[i] = dshape[i];
-      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
+
+    // label.shape == data.shape: use probability as label
+    if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
+      if (param_.multi_output) {
+        TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]);
+        TShape lshape2(dshape.ndim() - 1);
+        lshape2[0] = dshape[0];
+        for (index_t i = 2; i < dshape.ndim(); ++i)
+          lshape2[i-1] = dshape[i];
+        TShape lshape3 = dshape;
+        lshape3[1] = 1;
+        if (in_shape->at(softmaxout_enum::kLabel).ndim() == 0) {
+          in_shape->at(softmaxout_enum::kLabel) = lshape1;
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
+        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
+        } else {
+          std::ostringstream os;
+          os << "Expecting " << lshape1 << " or " << lshape2
+             << ". But got " << in_shape->at(softmaxout_enum::kLabel);
+          throw InferShapeError(os.str(), softmaxout_enum::kLabel);
+        }
+      } else {
+        TShape label_shape(dshape.ndim() - 1);
+        for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
+          label_shape[i] = dshape[i];
+        SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
+      }
     }
     out_shape->clear();
     out_shape->push_back(dshape);
diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc
index c4b14e3ac5b4..439a400b4f99 100644
--- a/src/operator/softmax_output.cc
+++ b/src/operator/softmax_output.cc
@@ -32,7 +32,8 @@ DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
 MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
 .describe("Perform a softmax transformation on input, backprop with logloss.")
 .add_argument("data", "Symbol", "Input data to softmax.")
-.add_argument("label", "Symbol", "Label data.")
+.add_argument("label", "Symbol", "Label data, can also be "\
+              "probability value with same shape as data")
 .add_arguments(SoftmaxOutputParam::__FIELDS__());

 MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)
@@ -42,4 +43,3 @@ MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)

 }  // namespace op
 }  // namespace mxnet
-
diff --git a/src/operator/spatial_transformer-inl.h b/src/operator/spatial_transformer-inl.h
new file mode 100644
index 000000000000..74d35ffd7b9e
--- /dev/null
+++ b/src/operator/spatial_transformer-inl.h
@@ -0,0 +1,264 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file spatial_transformer-inl.h
+ * \brief
+ *  Reproducing paper: Jaderberg M, Simonyan K, Zisserman A. "Spatial transformer networks"
+ * \author Wei Wu
+*/
+#ifndef MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
+#define MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+
+
+namespace mxnet {
+namespace op {
+
+namespace st {
+enum SpatialTransformerOpInputs {kData, kLoc};
+enum SpatialTransformerOpOutputs {kOut, kGridDst, kGridSrc};
+enum SpatialTransformerOpResource {kTempSpace};
+enum SpatialTransformerTransformType {kAffine};
+enum SpatialTransformerSamplerType {kBilinear};
+}
+
+struct SpatialTransformerParam : public dmlc::Parameter<SpatialTransformerParam> {
+  TShape target_shape;
+  int transform_type;
+  int sampler_type;
+  DMLC_DECLARE_PARAMETER(SpatialTransformerParam) {
+    int shape[] = {0, 0};
+    DMLC_DECLARE_FIELD(target_shape).set_default(TShape(shape, shape + 2))
+    .describe("output shape(h, w) of spatial transformer: (y, x)");
+    DMLC_DECLARE_FIELD(transform_type).add_enum("affine", st::kAffine)
+    .describe("transformation type");
+    DMLC_DECLARE_FIELD(sampler_type).add_enum("bilinear", st::kBilinear)
+    .describe("sampling type");
+  }
+};
+
+template<typename xpu, typename DType>
+class SpatialTransformerOp : public Operator {
+ public:
+  explicit SpatialTransformerOp(SpatialTransformerParam p) {
+    this->param_ = p;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = in_data[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> out = out_data[st::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 2, DType> grid_dst = out_data[st::kGridDst].get<xpu, 2, DType>(s);
+    Tensor<xpu, 3, DType> grid_src = out_data[st::kGridSrc].get<xpu, 3, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Tensor<xpu, 3, DType> loc = in_data[st::kLoc].get_with_shape<xpu, 3, DType>(loc_shape, s);
+    Tensor<cpu, 2, DType> workspace =
+        ctx.requested[st::kTempSpace].get_host_space_typed<2, DType>(
+            grid_dst.shape_);
+    for (index_t i = 1; i <= workspace.size(1); i++) {
+      // grid dst coordinate is (x, y, 1)
+      workspace[0][i-1] = -1.0 + (i-1) % param_.target_shape[1] * 2.0 /
+                          (param_.target_shape[1] - 1);
+      workspace[1][i-1] = -1.0 + (i-1) / param_.target_shape[1] * 2.0 /
+                          (param_.target_shape[0] - 1);
+      workspace[2][i-1] = 1.0;
+    }
+    Copy(grid_dst, workspace, grid_dst.stream_);
+    for (index_t batch = 0; batch < data.size(0); batch++) {
+      if (param_.transform_type == st::kAffine) {
+        grid_src[batch] = dot(loc[batch], grid_dst);
+      }
+    }
+    if (param_.sampler_type == st::kBilinear) {
+      BilinearSamplingForward(out, data, grid_src);
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_data.size(), 3);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data = in_data[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> grad = out_grad[st::kOut].get<xpu, 4, DType>(s);
+    Tensor<xpu, 4, DType> gdata = in_grad[st::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 2, DType> grid_dst = out_data[st::kGridDst].get<xpu, 2, DType>(s);
+    Tensor<xpu, 3, DType> grid_src = out_data[st::kGridSrc].get<xpu, 3, DType>(s);
+    Shape<3> loc_shape = Shape3(data.size(0), 2, 3);
+    Tensor<xpu, 3, DType> gloc = in_grad[st::kLoc].get_with_shape<xpu, 3, DType>(loc_shape, s);
+    gdata = 0.0;
+    if (param_.sampler_type == st::kBilinear) {
+      BilinearSamplingBackward(gdata, grid_src, grad, data);
+    }
+    for (index_t batch = 0; batch <
data.size(0); batch++) {
+      if (param_.transform_type == st::kAffine) {
+        gloc[batch] = dot(grid_src[batch], grid_dst.T());
+      }
+    }
+  }
+
+ private:
+  SpatialTransformerParam param_;
+};  // class SpatialTransformerOp
+
+template<typename xpu>
+Operator* CreateOp(SpatialTransformerParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class SpatialTransformerProp : public OperatorProperty {
+ public:
+  int NumVisibleOutputs() const override {
+    return 1;
+  }
+
+  int NumOutputs() const override {
+    return 3;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "loc"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "grid_dst", "grid_src"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, loc]";
+    CHECK_EQ(param_.transform_type, st::kAffine) << "only supports affine transform currently";
+    CHECK_EQ(param_.sampler_type, st::kBilinear) << "only supports bilinear sampling currently";
+    const TShape &dshape = (*in_shape)[st::kData];
+    const TShape &lshape = (*in_shape)[st::kLoc];
+    if (dshape.ndim() == 0) return false;
+    CHECK_EQ(dshape.ndim(), 4) \
+        << "input data should be 4D in batch-num_filter-y-x";
+    if (lshape.ndim() == 0) return false;
+    CHECK_EQ(lshape.ndim(), 2) \
+        << "localisation parameter should be 2D in batch-num_hidden";
+    if (param_.transform_type == st::kAffine) {
+      CHECK_EQ(lshape[1], 6) << "incorrect localisation network shape[1], should be 6";
+    }
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    CHECK_GT(param_.target_shape[0], 0) \
+        << "incorrect target_shape: " << param_.target_shape[0];
+    CHECK_GT(param_.target_shape[1], 0) \
+        << "incorrect target_shape: " << param_.target_shape[1];
+    (*out_shape)[st::kOut][2] = param_.target_shape[0];
+    (*out_shape)[st::kOut][3] = param_.target_shape[1];
+    out_shape->push_back(Shape2(3, param_.target_shape[0]*param_.target_shape[1]));
+    out_shape->push_back(Shape3(dshape[0], 2, param_.target_shape[0]*param_.target_shape[1]));
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    int dtype = -1;
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if (dtype == -1) {
+        dtype = in_type->at(i);
+      } else {
+        CHECK(in_type->at(i) == dtype ||
+              in_type->at(i) == -1) <<
+            "Non-uniform data type in SpatialTransformer";
+      }
+    }
+    if (dtype == -1) {
+      LOG(FATAL) << "Not enough information to infer type in SpatialTransformer.";
+      return false;
+    }
+    size_t nin = this->ListArguments().size();
+    in_type->clear();
+    for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype);
+    size_t naux = this->ListAuxiliaryStates().size();
+    aux_type->clear();
+    for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype);
+    size_t nout = this->ListOutputs().size();
+    out_type->clear();
+    for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SpatialTransformerProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "SpatialTransformer";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+      const std::vector<int> &out_grad,
+      const std::vector<int> &in_data,
+      const std::vector<int> &out_data) const override {
+    return {out_grad[st::kOut],
+            out_data[st::kGridDst],
+            out_data[st::kGridSrc],
+            in_data[st::kData]
+           };
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  #if CUDNN_MAJOR == 5
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+  #endif
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  SpatialTransformerParam param_;
+};  // class SpatialTransformerProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SPATIAL_TRANSFORMER_INL_H_
diff --git a/src/operator/spatial_transformer.cc b/src/operator/spatial_transformer.cc
new file mode 100644
index 000000000000..de1dc733ef57
--- /dev/null
+++ b/src/operator/spatial_transformer.cc
@@ -0,0 +1,138 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file spatial_transformer.cc
+ * \brief
+ * \author Wei Wu
+*/
+
+#include "./spatial_transformer-inl.h"
+
+namespace mshadow {
+template<typename DType>
+inline void BilinearSamplingForward(const Tensor<cpu, 4, DType> &output,
+                                    const Tensor<cpu, 4, DType> &input,
+                                    const Tensor<cpu, 3, DType> grid_src) {
+  DType *out = output.dptr_;
+  const DType *data = input.dptr_;
+  const DType *grid = grid_src.dptr_;
+  int o_n = output.size(0), o_c = output.size(1), o_h = output.size(2), o_w = output.size(3);
+  int i_c = input.size(1), i_h = input.size(2), i_w = input.size(3);
+  for (index_t n = 0; n < o_n; ++n) {
+    for (index_t c = 0; c < o_c; ++c) {
+      for (index_t h = 0; h < o_h; ++h) {
+        for (index_t w = 0; w < o_w; ++w) {
+          index_t out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w;
+          index_t grid_index = n * o_h * o_w * 2 + h * o_w + w;
+          DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+          DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2;
+          index_t top_left_y = std::min(i_h, std::max(0, static_cast<int>(floor(y_real))));
+          index_t top_left_x = std::min(i_w, std::max(0, static_cast<int>(floor(x_real))));
+          DType top_left_y_w = 1.0 - (y_real - top_left_y);
+          DType top_left_x_w = 1.0 - (x_real - top_left_x);
+          index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x;
+          DType top_left_v = *(data + data_index);
+          DType top_right_v = *(data + data_index + 1);
+          DType bottom_left_v = *(data + data_index + i_w);
+          DType bottom_right_v = *(data + data_index + i_w + 1);
+          *(out+out_index) = top_left_v * top_left_y_w * top_left_x_w +
+                             top_right_v * top_left_y_w * (1.0 - top_left_x_w) +
+                             bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w +
+                             bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w);
+        }
+      }
+    }
+  }
+}
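+// (Editorial note, not part of the patch.) The sampling grid is normalized:
+// a grid value g in [-1, 1] maps to the pixel coordinate (g + 1)*(size - 1)/2,
+// so for i_w = 5 the values -1, 0, 1 land on columns 0, 2, 4. The four texels
+// around (x_real, y_real) are blended with bilinear weights (1-dx)(1-dy),
+// dx(1-dy), (1-dx)dy and dx*dy, where dx = x_real - top_left_x and
+// dy = y_real - top_left_y; the backward pass below accumulates gradients
+// with the same weights.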
+
+template<typename DType>
+inline void BilinearSamplingBackward(const Tensor<cpu, 4, DType> &input_grad,
+                                     const Tensor<cpu, 3, DType> &grid_src_data,
+                                     const Tensor<cpu, 4, DType> &output_grad,
+                                     const Tensor<cpu, 4, DType> &input_data) {
+  DType *g_input = input_grad.dptr_;
+  DType *grid_src = grid_src_data.dptr_;
+  const DType *grad = output_grad.dptr_;
+  const DType *data = input_data.dptr_;
+  int o_n = output_grad.size(0), o_c = output_grad.size(1),
+      o_h = output_grad.size(2), o_w = output_grad.size(3);
+  int i_c = input_data.size(1), i_h = input_data.size(2), i_w = input_data.size(3);
+  for (index_t n = 0; n < o_n; ++n) {
+    for (index_t h = 0; h < o_h; ++h) {
+      for (index_t w = 0; w < o_w; ++w) {
+        DType top_left_y_gw = 0.0;
+        DType top_left_x_gw = 0.0;
+        index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w;
+        DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2;
+        DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2;
+        index_t top_left_y = std::min(i_h, std::max(0, static_cast<int>(floor(y_real))));
+        index_t top_left_x = std::min(i_w, std::max(0, static_cast<int>(floor(x_real))));
+        DType top_left_y_w = 1.0 - (y_real - top_left_y);
+        DType top_left_x_w = 1.0 - (x_real - top_left_x);
+        for (index_t c = 0; c < o_c; ++c) {
+          index_t grad_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w;
+          index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w +
+                               top_left_x;
+          // calc 4 vertex value in input data
+          DType top_left_v = *(data + data_index);
+          DType top_right_v = *(data + data_index + 1);
+          DType bottom_left_v = *(data + data_index + i_w);
+          DType bottom_right_v = *(data + data_index + i_w + 1);
+          // calc input grad
+          *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w;
+          *(g_input + data_index + 1) += *(grad + grad_index) * top_left_y_w
+                                         * (1.0 - top_left_x_w);
+          *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w)
+                                          * top_left_x_w;
+          *(g_input + data_index+ i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w)
+                                              * (1.0 - top_left_x_w);
+          // calc weight grad of top_left_w; multiplying by -1 gives the grad of grid_src
+          top_left_y_gw -= *(grad + grad_index) * (top_right_v - bottom_right_v +
+                            (top_left_v - top_right_v - bottom_left_v + bottom_right_v)
+                            * top_left_x_w);
+          top_left_x_gw -= *(grad + grad_index) * (bottom_left_v - bottom_right_v +
+                            (top_left_v - top_right_v - bottom_left_v + bottom_right_v)
+                            * top_left_y_w);
+        }
+        // calc grid_src grad
+        *(grid_src + grid_src_index + o_h * o_w) = top_left_y_gw * (i_h - 1) / 2;
+        *(grid_src + grid_src_index) = top_left_x_gw * (i_w - 1) / 2;
+      }
+    }
+  }
+}
+
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<cpu>(SpatialTransformerParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SpatialTransformerOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+Operator *SpatialTransformerProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                                   std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(SpatialTransformerParam);
+
+MXNET_REGISTER_OP_PROPERTY(SpatialTransformer, SpatialTransformerProp)
+.add_argument("data", "Symbol", "Input data to the SpatialTransformerOp.")
+.add_argument("loc", "Symbol", "localisation net: the output dim should be 6 when transform_type "
+              "is affine. The name of the loc symbol should preferably start with 'stn_loc', so "
+              "that it can be initialized with the identity transform; otherwise you should "
+              "initialize the weight and bias yourself.")
+.add_arguments(SpatialTransformerParam::__FIELDS__())
+.describe("Apply spatial transformer to input feature map.");
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/spatial_transformer.cu b/src/operator/spatial_transformer.cu
new file mode 100644
index 000000000000..11b4d54bf139
--- /dev/null
+++ b/src/operator/spatial_transformer.cu
@@ -0,0 +1,160 @@
+/*!
+ * Copyright (c) 2016 by Contributors + * \file spatial_transformer.cu + * \brief + * \author Wei Wu +*/ + +#include "./spatial_transformer-inl.h" +#include +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 +#include "./cudnn_spatial_transformer-inl.h" +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + +namespace mshadow { +template +__global__ void BilinearSamplingForwardKernel(const int i_c, const int i_h, + const int i_w, const DType* data, + const DType* grid, const int o_n, + const int o_c, const int o_h, + const int o_w, DType* out) { + for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + index < o_n * o_c * o_h * o_w; + index += blockDim.x * gridDim.x * gridDim.y) { + // (n, c, h, w) is the element in out + int w = index % o_w; + int h = (index / o_w) % o_h; + int c = (index / o_w / o_h) % o_c; + int n = index / o_w / o_h / o_c; + index_t out_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; + index_t grid_index = n * o_h * o_w * 2 + h * o_w + w; + DType y_real = (*(grid + grid_index + o_h * o_w) + 1) * (i_h - 1) / 2; + DType x_real = (*(grid + grid_index) + 1) * (i_w - 1) / 2; + index_t top_left_y = min(i_h, max(0, static_cast(floor(y_real)))); + index_t top_left_x = min(i_w, max(0, static_cast(floor(x_real)))); + DType top_left_y_w = 1.0 - (y_real - top_left_y); + DType top_left_x_w = 1.0 - (x_real - top_left_x); + index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; + DType top_left_v = *(data + data_index); + DType top_right_v = *(data + data_index + 1); + DType bottom_left_v = *(data + data_index + i_w); + DType bottom_right_v = *(data + data_index + i_w + 1); + *(out+out_index) = top_left_v * top_left_y_w * top_left_x_w + + top_right_v * top_left_y_w * (1.0 - top_left_x_w) + + bottom_left_v * (1.0 - top_left_y_w) * top_left_x_w + + bottom_right_v * (1.0 - top_left_y_w) * (1.0 - top_left_x_w); + } +} + +template +__global__ void BilinearSamplingBackwardKernel(const int i_c, const int i_h, + const int i_w, const DType* grad, + const DType* data, const int o_n, + const int o_c, const int o_h, + const int o_w, DType* g_input, + DType* grid_src) { + for (int index = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x; + index < o_n * o_h * o_w; + index += blockDim.x * gridDim.x * gridDim.y) { + // (n, c, h, w) is the element in grad + int w = index % o_w; + int h = (index / o_w) % o_h; + int n = index / o_w / o_h; + DType top_left_y_gw = 0.0; + DType top_left_x_gw = 0.0; + index_t grid_src_index = n * o_h * o_w * 2 + h * o_w + w; + DType y_real = (*(grid_src + grid_src_index + o_h * o_w) + 1) * (i_h - 1) / 2; + DType x_real = (*(grid_src + grid_src_index) + 1) * (i_w - 1) / 2; + index_t top_left_y = min(i_h, max(0, static_cast(floor(y_real)))); + index_t top_left_x = min(i_w, max(0, static_cast(floor(x_real)))); + DType top_left_y_w = 1.0 - (y_real - top_left_y); + DType top_left_x_w = 1.0 - (x_real - top_left_x); + for (index_t c = 0; c < o_c; ++c) { + index_t grad_index = n * o_c * o_h * o_w + c * o_h * o_w + h * o_w + w; + index_t data_index = n * i_c * i_h * i_w + c * i_h * i_w + top_left_y * i_w + top_left_x; + // calc 4 vertex value in input data + DType top_left_v = *(data + data_index); + DType top_right_v = *(data + data_index + 1); + DType bottom_left_v = *(data + data_index + i_w); + DType bottom_right_v = *(data + data_index + i_w + 1); + // calc input grad + *(g_input + data_index) += *(grad + grad_index) * top_left_y_w * top_left_x_w; + *(g_input + data_index + 1) += *(grad + grad_index) 
* top_left_y_w * (1.0 - top_left_x_w); + *(g_input + data_index+ i_w) += *(grad + grad_index) * (1.0 - top_left_y_w) * top_left_x_w; + *(g_input + data_index+ i_w + 1) += *(grad + grad_index) * (1.0 - top_left_y_w) * + (1.0 - top_left_x_w); + // calc weight grad of top_left_w, then multiple -1 is the grad of grid_src + top_left_y_gw -= *(grad + grad_index) * (top_right_v - bottom_right_v + + (top_left_v - top_right_v - bottom_left_v + bottom_right_v) * top_left_x_w); + top_left_x_gw -= *(grad + grad_index) * (bottom_left_v - bottom_right_v + (top_left_v - + top_right_v - bottom_left_v + bottom_right_v) * top_left_y_w); + } + // calc grid_src grad + *(grid_src + grid_src_index + o_h * o_w) = top_left_y_gw * (i_h - 1) / 2; + *(grid_src + grid_src_index) = top_left_x_gw * (i_w - 1) / 2; + } +} + +template +inline void BilinearSamplingForward(const Tensor &output, + const Tensor &input, + const Tensor grid_src) { + DType *out = output.dptr_; + const DType *data = input.dptr_; + const DType *grid = grid_src.dptr_; + int o_n = output.size(0), o_c = output.size(1), o_h = output.size(2), o_w = output.size(3); + int i_c = input.size(1), i_h = input.size(2), i_w = input.size(3); + using namespace cuda; + const int max_block = (output.shape_.Size() + kMaxThreadsPerBlock - 1) / kMaxThreadsPerBlock; + dim3 num_blocks(kMaxGridNum, (max_block + kMaxGridNum - 1) / kMaxGridNum); + dim3 threads_per_block(kMaxThreadsPerBlock); + CheckLaunchParam(num_blocks, threads_per_block, "spatial transformer forward"); + cudaStream_t stream = Stream::GetStream(output.stream_); + BilinearSamplingForwardKernel << > >( + i_c, i_h, i_w, data, grid, o_n, o_c, o_h, o_w, out); +} + +template +inline void BilinearSamplingBackward(const Tensor &input_grad, + const Tensor &grid_src_data, + const Tensor &output_grad, + const Tensor &input_data) { + DType *g_input = input_grad.dptr_; + DType *grid_src = grid_src_data.dptr_; + const DType *grad = output_grad.dptr_; + const DType *data = input_data.dptr_; + int o_n = output_grad.size(0), o_c = output_grad.size(1), + o_h = output_grad.size(2), o_w = output_grad.size(3); + int i_c = input_data.size(1), i_h = input_data.size(2), i_w = input_data.size(3); + using namespace cuda; + const int max_block = (output_grad.shape_.Size() / o_c + kMaxThreadsPerBlock - 1) + / kMaxThreadsPerBlock; + dim3 num_blocks(kMaxGridNum, (max_block + kMaxGridNum - 1) / kMaxGridNum); + dim3 threads_per_block(kMaxThreadsPerBlock); + CheckLaunchParam(num_blocks, threads_per_block, "spatial transformer backward"); + cudaStream_t stream = Stream::GetStream(input_grad.stream_); + BilinearSamplingBackwardKernel << > >( + i_c, i_h, i_w, grad, data, o_n, o_c, o_h, o_w, g_input, grid_src); +} + +} // namespace mshadow + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(SpatialTransformerParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNSpatialTransformerOp(param); + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new SpatialTransformerOp(param); + }) +#endif // MXNET_USE_CUDNN && CUDNN_MAJOR + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h new file mode 100644 index 000000000000..1221bf923cda --- /dev/null +++ b/src/operator/svm_output-inl.h @@ -0,0 +1,206 @@ +/*! 
+
+namespace mxnet {
+namespace op {
+template<>
+Operator* CreateOp<gpu>(SpatialTransformerParam param, int dtype) {
+  Operator *op = NULL;
+#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR == 5
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new CuDNNSpatialTransformerOp<DType>(param);
+  })
+#else
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SpatialTransformerOp<gpu, DType>(param);
+  })
+#endif  // MXNET_USE_CUDNN && CUDNN_MAJOR
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h
new file mode 100644
index 000000000000..1221bf923cda
--- /dev/null
+++ b/src/operator/svm_output-inl.h
@@ -0,0 +1,206 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output-inl.h
+ * \brief
+ * \author Jonas Amaro
+*/
+#ifndef MXNET_OPERATOR_SVM_OUTPUT_INL_H_
+#define MXNET_OPERATOR_SVM_OUTPUT_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <utility>
+#include "./operator_common.h"
+#include "./mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace svm_enum {
+enum SVMOutputOpInputs {kData, kLabel};
+enum SVMOutputOpOutputs {kOut};
+enum SVMOutputNormType {kNull, kBatch, kValid};
+enum SVMOutputOpResource {kTempSpace};
+}  // namespace svm_enum
+
+
+struct SVMOutputParam : public dmlc::Parameter<SVMOutputParam> {
+  float margin;
+  float regularization_coefficient;
+  bool use_linear;
+  DMLC_DECLARE_PARAMETER(SVMOutputParam) {
+    DMLC_DECLARE_FIELD(margin).set_default(1.0f)
+    .describe("Scale the DType(param_.margin) for activation size");
+    DMLC_DECLARE_FIELD(regularization_coefficient).set_default(1.0f)
+    .describe("Scale the coefficient responsible for balancing coefficient size and error tradeoff");
+    DMLC_DECLARE_FIELD(use_linear).set_default(false)
+    .describe("If set true, uses L1-SVM objective function. Default uses L2-SVM objective");
+  };
+};
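For reference, the gradients these parameters select between can be summarized per sample from the two CPU kernels in svm_output.cc below (scores s, true class k, margin m, C = regularization_coefficient):

```cpp
// Per-sample gradients produced by L1_SVM / L2_SVM (j != k):
//   L1-SVM:  d/ds[k] = -C * 1[m >  s[k]]
//            d/ds[j] = +C * 1[m > -s[j]]
//   L2-SVM:  d/ds[k] = -2C * (m - s[k]) * 1[m >  s[k]]
//            d/ds[j] = +2C * (m + s[j]) * 1[m > -s[j]]
```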
+
+template<typename xpu, typename DType>
+class SVMOutputOp : public Operator {
+ public:
+  explicit SVMOutputOp(SVMOutputParam param) : param_(param) {}
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2) << "Expecting [data, label]";
+    CHECK_EQ(out_data.size(), 1) << "Expecting [output]";
+    CHECK_EQ(req.size(), 1) << "Expecting output.size() == req.size()";
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 2, DType> data = in_data[svm_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Assign(out, req[svm_enum::kOut], F<mshadow_op::identity>(data));
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 2);
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_GE(in_grad.size(), 1);
+    CHECK_GE(req.size(), 1);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const TShape& label_shape = in_data[svm_enum::kLabel].shape_;
+
+    Tensor<xpu, 1, DType> label = in_data[svm_enum::kLabel].get_with_shape<xpu, 1, DType>(
+        Shape1(label_shape.ProdShape(0, label_shape.ndim())), s);
+    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> grad = in_grad[svm_enum::kData].FlatTo2D<xpu, DType>(s);
+    CHECK_EQ(grad.shape_, out.shape_) << "SVMOutputs: shape mismatch";
+
+    if (param_.use_linear) {
+      L1_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
+    } else {
+      L2_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
+    }
+  }
+
+ private:
+  SVMOutputParam param_;
+};  // class SVMOutputOp
+
+// Declare Factory function, used for dispatch specialization
+template<typename xpu>
+Operator* CreateOp(SVMOutputParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class SVMOutputProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "label"};
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, label]";
+    const TShape &dshape = in_shape->at(0);
+    if (dshape.ndim() == 0) return false;
+    TShape label_shape(dshape.ndim() - 1);
+    for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
+      label_shape[i] = dshape[i];
+    SHAPE_ASSIGN_CHECK(*in_shape, svm_enum::kLabel, label_shape);
+    out_shape->clear();
+    out_shape->push_back(dshape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. "
+                                       << "Expected " << dtype << " v.s. given "
+                                       << (*in_type)[i] << " at " << ListArguments()[i];
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new SVMOutputProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "SVMOutput";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {in_data[svm_enum::kLabel], out_data[svm_enum::kOut]};
+  }
+
+  std::vector<std::pair<int, void*> > BackwardInplaceOption(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data,
+    const std::vector<void*> &in_grad) const override {
+    return {{out_data[svm_enum::kOut], in_grad[svm_enum::kData]}};
+  }
+
+  std::vector<std::pair<int, void*> > ForwardInplaceOption(
+    const std::vector<int> &in_data,
+    const std::vector<void*> &out_data) const override {
+    return {{in_data[svm_enum::kData], out_data[svm_enum::kOut]}};
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ protected:
+  SVMOutputParam param_;
+};  // class SVMOutputProp
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_SVM_OUTPUT_INL_H_
diff --git a/src/operator/svm_output.cc b/src/operator/svm_output.cc
new file mode 100644
index 000000000000..87b9f0a96c28
--- /dev/null
+++ b/src/operator/svm_output.cc
@@ -0,0 +1,82 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output.cc
+ * \brief
+ * \author Jonas Amaro
+*/
+#include "./svm_output-inl.h"
+#include "./mshadow_op.h"
+
+namespace mshadow {
+  template<typename DType>
+  inline void L1_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<cpu, 2, DType> dst,
+                     const Tensor<cpu, 1, DType> & label,
+                     const Tensor<cpu, 2, DType> & src) {
+    for (index_t y = 0; y < dst.size(0); y++) {
+      const index_t k = static_cast<index_t>(label[y]);
+      for (index_t x = 0; x < dst.size(1); x++) {
+        if (x == k) {
+          dst[y][k] = -DType(margin > src[y][k]) * reg_coef;
+        } else {
+          dst[y][x] = DType(margin > -src[y][x]) * reg_coef;
+        }
+      }
+    }
+  }
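A worked example of the L1 branch above, with hypothetical values m = 1, C = 1, scores s = {0.2, -0.5, 0.1} and label k = 0:

```cpp
// dst[0] = -(1 >  0.2) * 1 = -1   // true-class score below margin: push up
// dst[1] = +(1 >  0.5) * 1 = +1   // -s[1] = 0.5 violates the margin: push down
// dst[2] = +(1 > -0.1) * 1 = +1
```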
+
+
+  template<typename DType>
+  inline void L2_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<cpu, 2, DType> dst,
+                     const Tensor<cpu, 1, DType> & label,
+                     const Tensor<cpu, 2, DType> & src) {
+    for (index_t y = 0; y < dst.size(0); y++) {
+      const index_t k = static_cast<index_t>(label[y]);
+      for (index_t x = 0; x < dst.size(1); x++) {
+        if (x == k) {
+          dst[y][k] = margin > src[y][k] ? 2*(margin - src[y][k]) : DType(0.0f);
+          dst[y][k] *= -reg_coef;
+        } else {
+          dst[y][x] = margin > -src[y][x] ? (-2)*(margin + src[y][x]) : DType(0.0f);
+          dst[y][x] *= -reg_coef;
+        }
+      }
+    }
+  }
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<cpu>(SVMOutputParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SVMOutputOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *SVMOutputProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                          std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+}
+
+DMLC_REGISTER_PARAMETER(SVMOutputParam);
+
+MXNET_REGISTER_OP_PROPERTY(SVMOutput, SVMOutputProp)
+.describe("Support Vector Machine based transformation on input, backprop L2-SVM")
+.add_argument("data", "Symbol", "Input data to svm.")
+.add_argument("label", "Symbol", "Label data.")
+.add_arguments(SVMOutputParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
new file mode 100644
index 000000000000..589eac75f458
--- /dev/null
+++ b/src/operator/svm_output.cu
@@ -0,0 +1,42 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ * \file svm_output.cu
+ * \brief
+ * \author Jonas Amaro
+*/
+
+#include "./svm_output-inl.h"
+
+namespace mshadow {
+  template<typename DType>
+  inline void L1_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<gpu, 2, DType> dst,
+                     const Tensor<gpu, 1, DType> & label,
+                     const Tensor<gpu, 2, DType> & src) {
+    LOG(FATAL) << "Not Implemented.";
+  }
+  template<typename DType>
+  inline void L2_SVM(const DType & margin,
+                     const DType & reg_coef,
+                     Tensor<gpu, 2, DType> dst,
+                     const Tensor<gpu, 1, DType> & label,
+                     const Tensor<gpu, 2, DType> & src) {
+    LOG(FATAL) << "Not Implemented.";
+  }
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+template<>
+Operator *CreateOp<gpu>(SVMOutputParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SVMOutputOp<gpu, DType>(param);
+  })
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
+
diff --git a/src/operator/swapaxis-inl.h b/src/operator/swapaxis-inl.h
index 6b3e0be2765b..fe301d1d186e 100644
--- a/src/operator/swapaxis-inl.h
+++ b/src/operator/swapaxis-inl.h
@@ -40,7 +40,7 @@ struct SwapAxisParam : public dmlc::Parameter<SwapAxisParam> {
 };
 
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class SwapAxisOp : public Operator {
  public:
   explicit SwapAxisOp(SwapAxisParam p) {
@@ -99,12 +99,12 @@ class SwapAxisOp : public Operator {
 
     Reshape2Five(&inter_shape, shape_in, dim1, dim2);
 
-    Tensor<xpu, 5> inter_data_in = data_in.get_with_shape<xpu, 5, real_t>(inter_shape, s);
+    Tensor<xpu, 5, DType> inter_data_in = data_in.get_with_shape<xpu, 5, DType>(inter_shape, s);
 
     Shape<5> inter_shape2 = inter_shape;
     std::swap(inter_shape2[1], inter_shape2[3]);
 
-    Tensor<xpu, 5> inter_data_out = data_out.get_with_shape<xpu, 5, real_t>(inter_shape2, s);
+    Tensor<xpu, 5, DType> inter_data_out = data_out.get_with_shape<xpu, 5, DType>(inter_shape2, s);
 
     inter_data_out = swapaxis<3, 1>(inter_data_in);
   }
@@ -138,7 +138,7 @@ class SwapAxisOp : public Operator {
 
 
 template<typename xpu>
-Operator* CreateOp(SwapAxisParam param);
+Operator* CreateOp(SwapAxisParam param, int dtype);
 
 
 #if DMLC_USE_CXX11
@@ -171,6 +171,17 @@ class SwapAxisProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "Input must have specified type";
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new SwapAxisProp();
     ptr->param_ = param_;
@@ -188,7 +199,13 @@ class SwapAxisProp : public OperatorProperty {
     return {out_grad[swapaxisenum::kOut]};
   };
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
 
  private:
   SwapAxisParam param_;
diff --git a/src/operator/swapaxis.cc b/src/operator/swapaxis.cc
index 427e83e3619a..d2570da6a400 100644
--- a/src/operator/swapaxis.cc
+++ b/src/operator/swapaxis.cc
@@ -11,12 +11,21 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator* CreateOp<cpu>(SwapAxisParam param) {
-  return new SwapAxisOp<cpu>(param);
+Operator* CreateOp<cpu>(SwapAxisParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SwapAxisOp<cpu, DType>(param);
+  });
+  return op;
 }
 
-Operator* SwapAxisProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* SwapAxisProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                         std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
diff --git a/src/operator/swapaxis.cu b/src/operator/swapaxis.cu
index c27d3d2f7a1b..93f78c2e733d 100644
--- a/src/operator/swapaxis.cu
+++ b/src/operator/swapaxis.cu
@@ -11,8 +11,12 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator *CreateOp<gpu>(SwapAxisParam param) {
-  return new SwapAxisOp<gpu>(param);
+Operator *CreateOp<gpu>(SwapAxisParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new SwapAxisOp<gpu, DType>(param);
+  });
+  return op;
 }
 
 }  // namespace op
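All of these dtype-aware factories lean on mshadow's real-type dispatch. MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { body }) expands to roughly the following (a sketch, not the exact macro text):

```cpp
switch (dtype) {
  case mshadow::kFloat32: { typedef float DType;  /* body */ } break;
  case mshadow::kFloat64: { typedef double DType; /* body */ } break;
  case mshadow::kFloat16: { typedef mshadow::half::half_t DType; /* body */ } break;
  default: LOG(FATAL) << "Unknown type enum " << dtype;
}
```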
diff --git a/src/operator/upsampling-inl.h b/src/operator/upsampling-inl.h
index 59a0a5a8a406..743427b7a942 100644
--- a/src/operator/upsampling-inl.h
+++ b/src/operator/upsampling-inl.h
@@ -62,7 +62,7 @@ struct UpSamplingParam : public dmlc::Parameter<UpSamplingParam> {
   }
 };  // struct UpSamplingParam
 
-template<typename xpu>
+template<typename xpu, typename DType>
 class UpSamplingNearestOp : public Operator {
  public:
   explicit UpSamplingNearestOp(UpSamplingParam p) {
@@ -82,11 +82,11 @@ class UpSamplingNearestOp : public Operator {
       return;
     }
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> out = out_data[up_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4, DType> out = out_data[up_enum::kOut].get<xpu, 4, DType>(s);
     if (param_.num_args > 1) {
       int begin = 0;
       for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4> data = in_data[i].get<xpu, 4, real_t>(s);
+        Tensor<xpu, 4, DType> data = in_data[i].get<xpu, 4, DType>(s);
         int end = begin + data.size(1);
         int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2);
         if (param_.multi_input_mode == up_enum::kSum) {
@@ -101,7 +101,7 @@ class UpSamplingNearestOp : public Operator {
         begin = end;
       }
     } else {
-      Tensor<xpu, 4> data = in_data[up_enum::kData].get<xpu, 4, real_t>(s);
+      Tensor<xpu, 4, DType> data = in_data[up_enum::kData].get<xpu, 4, DType>(s);
       Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale));
     }
   }
@@ -118,11 +118,11 @@ class UpSamplingNearestOp : public Operator {
     CHECK_EQ(out_grad.size(), 1);
     CHECK_EQ(in_grad.size(), param_.num_args);
     Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4> grad = out_grad[up_enum::kOut].get<xpu, 4, real_t>(s);
+    Tensor<xpu, 4, DType> grad = out_grad[up_enum::kOut].get<xpu, 4, DType>(s);
     if (param_.num_args > 1) {
       int begin = 0;
       for (int i = 0; i < param_.num_args; ++i) {
-        Tensor<xpu, 4> input_grad = in_grad[i].get<xpu, 4, real_t>(s);
+        Tensor<xpu, 4, DType> input_grad = in_grad[i].get<xpu, 4, DType>(s);
         mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
         int end = begin + input_grad.size(1);
         int scale = grad.size(2)/in_shape[0];
@@ -146,7 +146,7 @@ class UpSamplingNearestOp : public Operator {
         begin = end;
       }
     } else {
-      Tensor<xpu, 4> input_grad = in_grad[up_enum::kData].get<xpu, 4, real_t>(s);
+      Tensor<xpu, 4, DType> input_grad = in_grad[up_enum::kData].get<xpu, 4, DType>(s);
       mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]);
       Assign(input_grad, req[up_enum::kData],
              pool<mshadow::red::sum>(grad,
@@ -163,7 +163,7 @@ class UpSamplingNearestOp : public Operator {
 };  // class UpSamplingNearestOp
 
 template<typename xpu>
-Operator *CreateOp(UpSamplingParam param);
+Operator *CreateOp(UpSamplingParam param, int dtype);
 
 
 #if DMLC_USE_CXX11
@@ -232,6 +232,26 @@ class UpSamplingProp : public OperatorProperty {
     return true;
   }
 
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (index_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        CHECK_EQ((*in_type)[i], dtype) << "This layer requires uniform type. "
+                                       << "Expected " << dtype << " v.s. given "
+                                       << (*in_type)[i] << " at " << ListArguments()[i];
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
   OperatorProperty* Copy() const override {
     auto ptr = new UpSamplingProp();
     ptr->param_ = this->param_;
@@ -279,7 +299,14 @@ class UpSamplingProp : public OperatorProperty {
     }
   }
 
-  Operator* CreateOperator(Context ctx) const override;
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
 
  private:
   UpSamplingParam param_;
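Both CreateOp specializations below lower bilinear UpSampling onto a grouped Deconvolution. A quick check of the parameter arithmetic, using the deconvolution output size H_out = (H-1)*stride - 2*pad + kernel:

```cpp
// scale = 2: kernel = 2*2 - 2%2 = 4, stride = 2, pad = ceil((2-1)/2.0) = 1
//            H_out = (H-1)*2 - 2*1 + 4 = 2*H
// scale = 3: kernel = 2*3 - 3%2 = 5, stride = 3, pad = ceil((3-1)/2.0) = 1
//            H_out = (H-1)*3 - 2*1 + 5 = 3*H
// num_group = num_filter, so each channel is upsampled independently.
```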
diff --git a/src/operator/upsampling.cc b/src/operator/upsampling.cc
index d69e7e99c040..77373a820219 100644
--- a/src/operator/upsampling.cc
+++ b/src/operator/upsampling.cc
@@ -11,34 +11,42 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator *CreateOp<cpu>(UpSamplingParam param) {
-  if (param.sample_type == up_enum::kNearest) {
-    return new UpSamplingNearestOp<cpu>(param);
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = DeconvolutionParam();
-    int kernel = 2 * param.scale - param.scale % 2;
-    int stride = param.scale;
-    int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-    p.workspace = param.workspace;
-    p.num_group = param.num_filter;
-    p.num_filter = param.num_filter;
-    p.no_bias = true;
-    int shape[] = {1, 1};
-    shape[0] = shape[1] = kernel;
-    p.kernel = TShape(shape, shape + 2);
-    shape[0] = shape[1] = stride;
-    p.stride = TShape(shape, shape + 2);
-    shape[0] = shape[1] = pad;
-    p.pad = TShape(shape, shape + 2);
-    return new DeconvolutionOp<cpu>(p);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-    return NULL;
-  }
+Operator *CreateOp<cpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<cpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias = true;
+      int shape[] = {1, 1};
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<cpu, DType>(p);
+    } else {
+      LOG(FATAL) << "Unknown sample type";
+    }
+  });
+  return op;
 }
 
-Operator* UpSamplingProp::CreateOperator(Context ctx) const {
-  DO_BIND_DISPATCH(CreateOp, param_);
+Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                           std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
 }
 
 DMLC_REGISTER_PARAMETER(UpSamplingParam);
diff --git a/src/operator/upsampling.cu b/src/operator/upsampling.cu
index 526f3a91de84..95864e430010 100644
--- a/src/operator/upsampling.cu
+++ b/src/operator/upsampling.cu
@@ -11,30 +11,33 @@ namespace mxnet {
 namespace op {
 
 template<>
-Operator *CreateOp<gpu>(UpSamplingParam param) {
-  if (param.sample_type == up_enum::kNearest) {
-    return new UpSamplingNearestOp<gpu>(param);
-  } else if (param.sample_type == up_enum::kBilinear) {
-    DeconvolutionParam p = DeconvolutionParam();
-    int kernel = 2 * param.scale - param.scale % 2;
-    int stride = param.scale;
-    int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
-    p.workspace = param.workspace;
-    p.num_group = param.num_filter;
-    p.num_filter = param.num_filter;
-    p.no_bias = true;
-    int shape[] = {1, 1};
-    shape[0] = shape[1] = kernel;
-    p.kernel = TShape(shape, shape + 2);
-    shape[0] = shape[1] = stride;
-    p.stride = TShape(shape, shape + 2);
-    shape[0] = shape[1] = pad;
-    p.pad = TShape(shape, shape + 2);
-    return new DeconvolutionOp<gpu>(p);
-  } else {
-    LOG(FATAL) << "Unknown sample type";
-    return NULL;
-  }
+Operator *CreateOp<gpu>(UpSamplingParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    if (param.sample_type == up_enum::kNearest) {
+      op = new UpSamplingNearestOp<gpu, DType>(param);
+    } else if (param.sample_type == up_enum::kBilinear) {
+      DeconvolutionParam p = DeconvolutionParam();
+      int kernel = 2 * param.scale - param.scale % 2;
+      int stride = param.scale;
+      int pad = static_cast<int>(ceil((param.scale - 1) / 2.));
+      p.workspace = param.workspace;
+      p.num_group = param.num_filter;
+      p.num_filter = param.num_filter;
+      p.no_bias = true;
+      int shape[] = {1, 1};
+      shape[0] = shape[1] = kernel;
+      p.kernel = TShape(shape, shape + 2);
+      shape[0] = shape[1] = stride;
+      p.stride = TShape(shape, shape + 2);
+      shape[0] = shape[1] = pad;
+      p.pad = TShape(shape, shape + 2);
+      op = new DeconvolutionOp<gpu, DType>(p);
+    } else {
+      LOG(FATAL) << "Unknown sample type";
+    }
+  });
+  return op;
 }
 
 }  // namespace op
diff --git a/src/resource.cc b/src/resource.cc
index 2ea019f63fd0..bb1842ab83d1 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -24,6 +24,11 @@ struct SpaceAllocator {
   Storage::Handle handle;
   // internal CPU handle
   Storage::Handle host_handle;
+  // The old handles that need to be kept valid
+  // until release is called.
+  // This API allows several CUDA calls using
+  // temp space to get valid space until all the calls finished.
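```cpp
// Why the deferred frees below matter (hypothetical sketch): two async
// kernels can hold different generations of the temp space within one
// engine operation.
//   char* a = static_cast<char*>(alloc.GetSpace(1 << 10));
//   kernel1<<<grid, block, 0, stream>>>(a);   // async, still reading `a`
//   char* b = static_cast<char*>(alloc.GetSpace(1 << 20));  // grows the block
//   kernel2<<<grid, block, 0, stream>>>(b);   // `a` must stay valid meanwhile
//   // alloc.Release() is only safe once both kernels have completed.
```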
+  std::vector<Storage::Handle> old_handles;
 
   SpaceAllocator() {
     handle.dptr = nullptr;
@@ -33,30 +38,35 @@ struct SpaceAllocator {
   }
 
   inline void Release() {
-    if (handle.size != 0) {
-      Storage::Get()->Free(handle);
-      handle.size = 0;
+    for (const Storage::Handle& handle : old_handles) {
+      if (handle.size != 0) {
+        Storage::Get()->Free(handle);
+      }
     }
+    old_handles.clear();
   }
 
-  inline void ReleaseHost() {
-    if (host_handle.size != 0) {
-      Storage::Get()->Free(host_handle);
-      host_handle.size = 0;
-    }
+  inline void ReleaseAll() {
+    old_handles.push_back(handle);
+    old_handles.push_back(host_handle);
+    this->Release();
+    handle.size = 0;
+    host_handle.size = 0;
   }
 
   inline void* GetSpace(size_t size) {
     if (handle.size >= size) return handle.dptr;
-    this->Release();
-    handle = Storage::Get()->Alloc(size, ctx);
+    old_handles.push_back(handle);
+    handle = Storage::Get()->Alloc(
+        std::max(size, handle.size * 2), ctx);
    return handle.dptr;
   }
 
   inline void* GetHostSpace(size_t size) {
     if (host_handle.size >= size) return host_handle.dptr;
-    this->ReleaseHost();
-    host_handle = Storage::Get()->Alloc(size, Context());
+    old_handles.push_back(host_handle);
+    host_handle = Storage::Get()->Alloc(
+        std::max(size, handle.size * 2), Context());
     return host_handle.dptr;
   }
 };
@@ -203,8 +213,7 @@ class ResourceManagerImpl : public ResourceManager {
       Engine::Get()->DeleteVariable(
           [r](RunContext rctx){
             SpaceAllocator rcpy = r;
-            MSHADOW_CATCH_ERROR(rcpy.Release());
-            MSHADOW_CATCH_ERROR(rcpy.ReleaseHost());
+            MSHADOW_CATCH_ERROR(rcpy.ReleaseAll());
           }, ctx, resource[i].var);
     }
   }
@@ -251,6 +260,10 @@ void* Resource::get_host_space_internal(size_t size) const {
   return static_cast<resource::SpaceAllocator*>(ptr_)->GetHostSpace(size);
 }
 
+void Resource::release() const {
+  return static_cast<resource::SpaceAllocator*>(ptr_)->Release();
+}
+
 ResourceManager* ResourceManager::Get() {
   static resource::ResourceManagerImpl inst;
   return &inst;
diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h
index 6838af037535..2afb658bb9c6 100644
--- a/src/storage/cpu_device_storage.h
+++ b/src/storage/cpu_device_storage.h
@@ -8,6 +8,7 @@
 #include <dmlc/logging.h>
 #include <cstdlib>
+#include <new>
 #include "mxnet/base.h"
 
 namespace mxnet {
@@ -38,16 +39,15 @@ class CPUDeviceStorage {
 };  // class CPUDeviceStorage
 
 inline void* CPUDeviceStorage::Alloc(size_t size) {
-#if _MSC_VER
   void* ptr;
+#if _MSC_VER
   ptr = _aligned_malloc(size, alignment_);
-  return CHECK_NOTNULL(ptr);
+  if (ptr == NULL) throw std::bad_alloc();
 #else
-  void* ptr;
   int ret = posix_memalign(&ptr, alignment_, size);
-  CHECK_EQ(ret, 0) << "Allocation failed";
-  return ptr;
+  if (ret != 0) throw std::bad_alloc();
 #endif
+  return ptr;
 }
 
 inline void CPUDeviceStorage::Free(void* ptr) {
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index f92918ac7dc9..10684905a861 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -11,6 +11,7 @@
 #if MXNET_USE_CUDA
 #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
+#include <new>
 
 namespace mxnet {
 namespace storage {
@@ -36,7 +37,9 @@ class GPUDeviceStorage {
 inline void* GPUDeviceStorage::Alloc(size_t size) {
   void* ret = nullptr;
 #if MXNET_USE_CUDA
-  CUDA_CALL(cudaMalloc(&ret, size));
+  cudaError_t e = cudaMalloc(&ret, size);
+  if (e != cudaSuccess && e != cudaErrorCudartUnloading)
+    throw std::bad_alloc();
 #else  // MXNET_USE_CUDA
   LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index 7d3c0dcb7802..5fcf781a67f0 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -10,6 +10,7 @@
 #include <mutex>
 #include <unordered_map>
 #include <vector>
+#include <new>
 #include "./storage_manager.h"
 
 namespace mxnet {
@@ -18,7 +19,7 @@ namespace storage {
 /*!
  * \brief Storage manager with a memory pool.
 */
-template <class DeviceStorage, size_t kThreshold>
+template <class DeviceStorage>
 class PooledStorageManager final : public StorageManager {
  public:
  /*!
@@ -45,16 +46,21 @@ class PooledStorageManager final : public StorageManager {
   DISALLOW_COPY_AND_ASSIGN(PooledStorageManager);
 };  // class PooledStorageManager
 
-template <class DeviceStorage, size_t kThreshold>
-void* PooledStorageManager<DeviceStorage, kThreshold>::Alloc(size_t size) {
+template <class DeviceStorage>
+void* PooledStorageManager<DeviceStorage>::Alloc(size_t size) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto&& reuse_it = memory_pool_.find(size);
   if (reuse_it == memory_pool_.end() || reuse_it->second.size() == 0) {
-    if (kThreshold <= used_memory_) {
-      ReleaseAll();
-    }
     used_memory_ += size;
-    return DeviceStorage::Alloc(size);
+    for (int i = 0; i < 2; ++i) {
+      try {
+        return DeviceStorage::Alloc(size);
+      } catch (const std::bad_alloc& e) {
+        ReleaseAll();
+      }
+    }
+    LOG(FATAL) << "Memory allocation failed.";
+    return NULL;
   } else {
     auto&& reuse_pool = reuse_it->second;
     auto ret = reuse_pool.back();
@@ -63,16 +69,15 @@ void* PooledStorageManager<DeviceStorage>::Alloc(size_t size) {
   }
 }
 
-template <class DeviceStorage, size_t kThreshold>
-void PooledStorageManager<DeviceStorage, kThreshold>::Free(void* ptr,
-                                                           size_t size) {
+template <class DeviceStorage>
+void PooledStorageManager<DeviceStorage>::Free(void* ptr, size_t size) {
   std::lock_guard<std::mutex> lock(mutex_);
   auto&& reuse_pool = memory_pool_[size];
   reuse_pool.push_back(ptr);
 }
 
-template <class DeviceStorage, size_t kThreshold>
-void PooledStorageManager<DeviceStorage, kThreshold>::ReleaseAll() {
+template <class DeviceStorage>
+void PooledStorageManager<DeviceStorage>::ReleaseAll() {
   for (auto&& i : memory_pool_) {
     for (auto&& j : i.second) {
      DeviceStorage::Free(j);
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 4aeebd5681b7..79cc06c8dc0b 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -25,13 +25,12 @@ class StorageImpl : public Storage {
   virtual ~StorageImpl() = default;
 
  private:
-  static constexpr size_t kPoolThreshold = 4096 * 1024 * 1024ul;
   static constexpr size_t kMaxNumberOfDevices = Context::kMaxDevType + 1;
   static constexpr size_t kMaxNumberOfDeviceIDs = Context::kMaxDevID + 1;
 
   template <class DeviceStorage>
   using CurrentStorageManager =
-      storage::PooledStorageManager<DeviceStorage, kPoolThreshold>;
+      storage::PooledStorageManager<DeviceStorage>;
 
   static void ActivateDevice(Context ctx) {
     switch (ctx.dev_type) {
@@ -87,13 +86,13 @@ Storage::Handle StorageImpl::Alloc(size_t size, Context ctx) {
 void StorageImpl::Free(Storage::Handle handle) {
   const Context &ctx = handle.ctx;
   auto&& device = storage_managers_.at(ctx.dev_type);
-  storage::StorageManager *maneger = device.Get(
+  storage::StorageManager *manager = device.Get(
       ctx.dev_id, []() {
         LOG(FATAL) << "Cannot Free space to a device you have not allocated";
         return nullptr;
       });
   this->ActivateDevice(ctx);
-  maneger->Free(handle.dptr, handle.size);
+  manager->Free(handle.dptr, handle.size);
 }
 
 std::shared_ptr<Storage> Storage::_GetSharedRef() {
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index b990a36e86f5..4ea774829e2c 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -6,6 +6,7 @@
 #include <mxnet/symbolic.h>
 #include <memory>
 #include <map>
+#include <algorithm>
 #include <string>
 #include <vector>
 #include <utility>
@@ -288,6 +289,11 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) {
 
 GraphExecutor::~GraphExecutor() {
   Engine::Get()->WaitForAll();
+  for (auto item : cached_seg_opr_) {
+    if (item.opr != nullptr) {
+      Engine::Get()->DeleteOperator(item.opr);
+    }
+  }
   // need to delete the operators before delete the NDArray they referenced.
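  // (The cached segment operators deleted above capture engine variables of
  //  these NDArrays, which is why they go first; the same ordering applies
  //  to the per-node cached operators released below.)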
   for (OpNode& node : op_nodes_) {
     node.DeleteOperator();
@@ -785,7 +791,7 @@ void GraphExecutor::InitResources() {
   }
 }
 
-void GraphExecutor::InitOpNodes() {
+void GraphExecutor::InitOperators() {
   for (size_t i = 0; i < topo_order_.size(); ++i) {
     uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
@@ -805,6 +811,15 @@ void GraphExecutor::InitOperators() {
           graph_.nodes[graph_.nodes[nid].backward_source_id].op.get(),
           op_nodes_[graph_.nodes[nid].backward_source_id].op));
     }
+  }
+}
+
+void GraphExecutor::InitCachedOps() {
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& op_node = op_nodes_[nid];
     bool allow_cache = true;
     for (StaticGraph::DataEntry e : graph_.nodes[nid].inputs) {
       DataEntryInfo& info = op_nodes_[e.source_id].outputs[e.index];
@@ -824,8 +839,77 @@ void GraphExecutor::InitCachedOps() {
   }
 }
 
+void GraphExecutor::InitOpSegs() {
+  // heuristic to enable bulk execution.
+  cached_seg_opr_.clear();
+  CachedSegOpr p;
+  p.opr = nullptr;
+  cached_seg_opr_.resize(topo_order_.size(), p);
+
+  if (!prefer_bulk_execution_) return;
+  if (num_forward_nodes_ == topo_order_.size()) {
+    cached_seg_opr_[0] = this->CreateCachedSegOpr(0, topo_order_.size());
+    return;
+  }
+  int num_cseg = 0;
+  // normal procedure
+  for (size_t i = 0; i < topo_order_.size(); ++i) {
+    size_t j = i;
+    int hit_count = 0;
+    for (; j < topo_order_.size(); ++j) {
+      if (j == num_forward_nodes_) break;
+      uint32_t nid = topo_order_[j];
+      const OpNode& op_node = op_nodes_[nid];
+      const StaticGraph::Node& gnode = graph_.nodes[nid];
+      if (!op_node.activated) continue;
+      if (graph_.nodes[nid].is_variable()) continue;
+      if (op_node.op->exec_type() != Operator::kSync) break;
+      bool hit = false, tobind = false;
+
+      for (const DataEntryInfo& out : op_node.outputs) {
+        if (out.type == kBindByExternal) hit = true;
+      }
+      const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+      for (size_t i = 0; i < ninput; ++i) {
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        if (info.type == kBindByExternal) hit = true;
+        if (info.type == kTobeBindByExternal) tobind = true;
+      }
+      if (hit) ++hit_count;
+      if (tobind) break;
+      // if we encounter 3 consecutive blocks containing parameters, use them as a segment.
+      // this usually means conv-relu-bn
+      const int kHitMaxMagic = 2;
+      if (hit_count > kHitMaxMagic) break;
+    }
+    if (j > i + 1) {
+      cached_seg_opr_[i] = CreateCachedSegOpr(i, j);
+      ++num_cseg;
+      i = j - 1;
+    }
+  }
+}
+
 void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
   for (size_t i = topo_start; i < topo_end; ++i) {
+    uint32_t nid = topo_order_[i];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    OpNode& opnode = op_nodes_[nid];
+    opnode.op_ctx.is_train = is_train;
+  }
+
+  for (size_t i = topo_start; i < topo_end; ++i) {
+    if (!monitor_callback_) {
+      auto seg_op = cached_seg_opr_[i];
+      if (seg_op.opr != nullptr && seg_op.topo_end <= topo_end) {
+        Engine::Get()->Push(seg_op.opr, seg_op.ctx);
+        i = seg_op.topo_end - 1;
+        continue;
+      }
+    }
+    uint32_t nid = topo_order_[i];
     if (!op_nodes_[nid].activated) continue;
     if (graph_.nodes[nid].is_variable()) continue;
@@ -839,7 +923,6 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
                  &(opnode.outputs[0].data));
       continue;
     }
-    opnode.op_ctx.is_train = is_train;
     if (opnode.cached_opr != nullptr) {
       Engine::Get()->Push(opnode.cached_opr, opnode.ctx);
     } else {
@@ -943,6 +1026,128 @@ void GraphExecutor::Backward(const std::vector<NDArray> &head_grads) {
   RunOps(true, num_forward_nodes_, topo_order_.size());
 }
 
+GraphExecutor::CachedSegOpr
+GraphExecutor::CreateCachedSegOpr(size_t topo_start, size_t topo_end) {
+  std::vector<Engine::VarHandle> read_vars;
+  std::vector<Engine::VarHandle> write_vars;
+  Context *pctx = nullptr;
+  CachedSegOpr ret;
+  ret.topo_begin = topo_start;
+  ret.topo_end = topo_end;
+  ret.opr = nullptr;
+  for (size_t k = topo_start; k < topo_end; ++k) {
+    uint32_t nid = topo_order_[k];
+    OpNode& op_node = op_nodes_[nid];
+    const StaticGraph::Node& gnode = graph_.nodes[nid];
+    if (!op_nodes_[nid].activated) continue;
+    if (graph_.nodes[nid].is_variable()) continue;
+    if (op_node.op->exec_type() != Operator::kSync) return ret;
+    if (pctx == nullptr) pctx = &(op_node.ctx);
+    if (*pctx != op_node.ctx) {
+      return ret;
+    }
+    // AddTO: index is used to store in-place add resources.
+    const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+
+    for (const DataEntryInfo& out : op_node.outputs) {
+      if (out.type == kTobeBindByExternal) return ret;
+      write_vars.push_back(out.data.var());
+    }
+
+    for (const DataEntryInfo& aux : op_node.aux_states) {
+      if (aux.type == kTobeBindByExternal) return ret;
+      write_vars.push_back(aux.data.var());
+    }
+    for (size_t i = 0; i < ninput; ++i) {
+      const StaticGraph::DataEntry& e = gnode.inputs[i];
+      const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+      if (info.type == kTobeBindByExternal) return ret;
+      read_vars.push_back(info.data.var());
+    }
+    for (const Resource& r : op_node.op_ctx.requested) {
+      write_vars.push_back(r.var);
+    }
+  }
+  if (pctx == nullptr) return ret;
+  ret.ctx = *pctx;
+  // deduplication
+  std::sort(write_vars.begin(), write_vars.end());
+  write_vars.resize(std::unique(write_vars.begin(), write_vars.end()) -
+                    write_vars.begin());
+  std::sort(read_vars.begin(), read_vars.end());
+  read_vars.resize(std::unique(read_vars.begin(), read_vars.end()) -
+                   read_vars.begin());
+  auto wit = write_vars.begin();
+  auto rtop = read_vars.begin();
+  for (auto rit = read_vars.begin(); rit != read_vars.end(); ++rit) {
+    while (wit != write_vars.end() && *wit < *rit) ++wit;
+    if (*wit != *rit) {
+      *rtop = *rit;
+      ++rtop;
+    }
+  }
+  read_vars.resize(rtop - read_vars.begin());
+  bool is_gpu = pctx->dev_mask() == gpu::kDevMask;
+  auto exec_fun = [this, topo_start, topo_end, is_gpu]
+      (RunContext ctx, Engine::CallbackOnComplete on_complete) {
+    std::vector<OpReqType> req;
+    std::vector<TBlob> in_data, out_data, aux_data;
+    for (size_t k = topo_start; k < topo_end; ++k) {
+      uint32_t nid = topo_order_[k];
+      if (!op_nodes_[nid].activated) continue;
+      if (graph_.nodes[nid].is_variable()) continue;
+      OpNode& op_node = op_nodes_[nid];
+      const StaticGraph::Node& gnode = graph_.nodes[nid];
+      CHECK_NE(op_node.op->exec_type(), Operator::kCrossDeviceCopy);
+      CHECK_NE(op_node.op->exec_type(), Operator::kAsync);
+      // AddTO: index is used to store in-place add resources.
+      const size_t ninput = gnode.inputs.size() - gnode.addto_index.size();
+      req.clear();
+      in_data.clear();
+      out_data.clear();
+      aux_data.clear();
+      for (const DataEntryInfo& out : op_node.outputs) {
+        req.push_back(out.op_req);
+        out_data.push_back(out.data.data());
+      }
+      for (size_t i = 0; i < gnode.addto_index.size(); ++i) {
+        CHECK_EQ(req[gnode.addto_index[i]], kWriteInplace);
+        req[gnode.addto_index[i]] = kAddTo;
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i + ninput];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        CHECK_EQ(info.inplace_op_id, static_cast<int>(nid));
+      }
+      // aux
+      for (const DataEntryInfo& aux : op_node.aux_states) {
+        aux_data.push_back(aux.data.data());
+      }
+      // input
+      for (size_t i = 0; i < ninput; ++i) {
+        const StaticGraph::DataEntry& e = graph_.nodes[nid].inputs[i];
+        const DataEntryInfo &info = op_nodes_[e.source_id].outputs[e.index];
+        in_data.push_back(info.data.data());
+      }
+      // run the function.
+      Operator* op = op_node.op.get();
+      OpContext* op_ctx_ptr = &op_node.op_ctx;
+      op_ctx_ptr->run_ctx = ctx;
+      op->Forward(*op_ctx_ptr, in_data, req, out_data, aux_data);
+    }
+    if (is_gpu) {
+#if MXNET_USE_CUDA
+      // Wait GPU kernel to finish.
+      ctx.get_stream<gpu>()->Wait();
+#else
+      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+    }
+    on_complete();
+  };
+  ret.opr = Engine::Get()->NewOperator(
+      exec_fun, read_vars, write_vars, FnProperty::kNormal);
+  return ret;
+}
+
 Executor *Executor::Bind(Symbol symbol,
                          const Context& default_ctx,
                          const std::map<std::string, Context>& group2ctx,
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index c3e1ccb86c7c..a2b5f054721e 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -46,6 +46,7 @@ class GraphExecutor : public Executor {
             const std::vector<NDArray> &aux_states,
             Executor* shared_exec = nullptr) {
     enable_inplace_allocation_ = dmlc::GetEnv("MXNET_EXEC_ENABLE_INPLACE", true);
+    prefer_bulk_execution_ = dmlc::GetEnv("MXNET_EXEC_PREFER_BULK_EXEC", true);
     if (shared_exec != NULL) {
       GraphExecutor* gexec = dynamic_cast<GraphExecutor*>(shared_exec);
       CHECK(gexec) << "Input executor for sharing memory must have GraphExecutor type.";
@@ -63,9 +64,11 @@ class GraphExecutor : public Executor {
                     in_args, arg_grad_store, grad_req_type, need_backward);
     this->InitDataEntryInfo(in_args, arg_grad_store, grad_req_type, aux_states);
+    this->InitOperators();
     this->InitDataEntryMemory();
     this->InitResources();
-    this->InitOpNodes();
+    this->InitCachedOps();
+    this->InitOpSegs();
   }
 
 protected:
@@ -156,6 +159,17 @@ class GraphExecutor : public Executor {
       }
     }
   };
+  // a cached segment operator that executes a segment
+  struct CachedSegOpr {
+    // context of the operator
+    Context ctx;
+    // begin in topo order
+    size_t topo_begin;
+    // end in topo order
+    size_t topo_end;
+    // the cached operator
+    Engine::OprHandle opr;
+  };
   /*!
   * \brief Get input option of a node.
   *  This function is overridden for both Forward and Backward node.
   * \return the execution entry.
   */
   inline OpExecEntry GetOpExecEntry(uint32_t node_id);
+  /*!
+   * \brief Try to create a cached operator to run segments between start and end
+   * \param topo_start beginning of segment
+   * \param topo_end end of segment
+   * \return the cached operator.
+   *  The ret.opr can be nullptr if the creation failed
+   */
+  CachedSegOpr CreateCachedSegOpr(size_t topo_start, size_t topo_end);
   // initialize the internal graph structure
   void InitGraph(const Symbol &symbol,
                  const Context& default_ctx,
@@ -209,7 +231,11 @@ class GraphExecutor : public Executor {
   // initialize the internal resources for each op
   void InitResources();
   // initialize OpNode data structure
-  void InitOpNodes();
+  void InitOperators();
+  // initialize OpNode data structure
+  void InitCachedOps();
+  // initialize segments of code to run together as a group.
+  void InitOpSegs();
   // assign context to the graph, this will mutate the graph.
   void AssignContext(const Context default_ctx,
                      const std::map<std::string, Context>& ctx_map,
@@ -232,6 +258,8 @@ class GraphExecutor : public Executor {
   size_t total_allocated_temp_;
   // number of forward nodes in the graph
   size_t num_forward_nodes_;
+  // whether to enable bulk execution
+  bool prefer_bulk_execution_;
   // head gradient node in the graph, if there is backward pass
   std::vector<uint32_t> head_grad_nodes_;
   // mirror map of nodes, experimental feature, normally can be ignored.
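The net effect of the segment machinery is one engine dispatch per segment instead of one per node; MXNET_EXEC_PREFER_BULK_EXEC (read in Bind above) toggles it. A sketch of the difference (counts illustrative):

```cpp
// Bulk execution off: every node is a separate engine push.
//   for each node n in segment: Engine::Get()->Push(n.cached_opr, n.ctx);  // N dispatches
// Bulk execution on: the whole synchronous segment is one closure.
//   Engine::Get()->Push(seg.opr, seg.ctx);                                 // 1 dispatch
// Disable it when debugging per-op behavior, e.g.:
//   MXNET_EXEC_PREFER_BULK_EXEC=0 python train.py
```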
@@ -246,6 +274,8 @@ class GraphExecutor : public Executor {
   std::shared_ptr<GraphStoragePool> shared_mem_;
   // monitor call back
   std::function<void(const char*, void*)> monitor_callback_;
+  // cached segment operator
+  std::vector<CachedSegOpr> cached_seg_opr_;
 };  // class GraphExecutor
 }  // namespace mxnet
 #endif  // MXNET_SYMBOL_GRAPH_EXECUTOR_H_
diff --git a/src/symbol/symbol.cc b/src/symbol/symbol.cc
index 12978e064b9f..01614a47d86a 100644
--- a/src/symbol/symbol.cc
+++ b/src/symbol/symbol.cc
@@ -63,6 +63,39 @@ struct Symbol::Node {
       attr.reset(new std::map<std::string, std::string>(*(other.attr)));
     }
   }
+  ~Node() {
+    if (inputs.size() != 0 || backward_source_node.get() != nullptr) {
+      // explicit destructor to avoid the stack overflow caused by a long
+      // recursive deletion chain:
+      // run a DFS that explicitly stores the nodes to be deleted in to_delete
+      std::vector<std::shared_ptr<Node> > to_delete;
+      std::vector<Node*> stack{this};
+
+      while (!stack.empty()) {
+        Node *n = stack.back();
+        stack.pop_back();
+
+        for (DataEntry& e : n->inputs) {
+          // if this ref is the only reference,
+          // the target node needs to be deleted
+          if (e.source.unique()) {
+            stack.push_back(e.source.get());
+            to_delete.emplace_back(std::move(e.source));
+          } else {
+            // otherwise, resetting the shared_ptr won't trigger the destructor.
+            e.source.reset();
+          }
+        }
+        if (n->backward_source_node.unique()) {
+          stack.push_back(n->backward_source_node.get());
+          to_delete.emplace_back(std::move(n->backward_source_node));
+        } else {
+          n->backward_source_node.reset();
+        }
+        n->inputs.clear();
+      }
+    }
+  }
   /*! \return Whether the symbol is atomic */
   inline bool is_atomic() const {
     return inputs.size() == 0 && op != nullptr;
@@ -118,7 +151,7 @@ inline void KeywordArgumentMismatch(const char *source,
   }
 
   for (const auto& key : user_args) {
-    if (keys.count(key) == 0 && key.substr(key.size() - 5, 5) != "label") {
+    if (keys.count(key) == 0) {
       LOG(FATAL) << source
                  << "Keyword argument name " << key << " not found."
<< msg.str(); diff --git a/tests/python/common/get_data.py b/tests/python/common/get_data.py index 65e8ac59ad6f..db7165e2903c 100644 --- a/tests/python/common/get_data.py +++ b/tests/python/common/get_data.py @@ -18,7 +18,7 @@ def GetMNIST_ubyte(): (not os.path.exists('data/train-labels-idx1-ubyte')) or \ (not os.path.exists('data/t10k-images-idx3-ubyte')) or \ (not os.path.exists('data/t10k-labels-idx1-ubyte')): - os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip -P data/") + os.system("wget http://data.dmlc.ml/mxnet/data/mnist.zip -P data/") os.chdir("./data") os.system("unzip -u mnist.zip") os.chdir("..") @@ -28,7 +28,7 @@ def GetCifar10(): if not os.path.isdir("data/"): os.system("mkdir data/") if not os.path.exists('data/cifar10.zip'): - os.system("wget http://webdocs.cs.ualberta.ca/~bx3/data/cifar10.zip -P data/") + os.system("wget http://data.dmlc.ml/mxnet/data/cifar10.zip -P data/") os.chdir("./data") os.system("unzip -u cifar10.zip") os.chdir("..") diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 7bf532d8468c..daa60e1779a0 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1,4 +1,4 @@ -import sys +import sys import os curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) @@ -8,20 +8,23 @@ from numpy.testing import assert_allclose import time -def check_consistency(sym, ctx_list, scale=1.0): +def check_consistency(sym, ctx_list, scale=1.0, grad_req='write'): tol = {np.dtype(np.float16): 1e-1, np.dtype(np.float32): 1e-3, np.dtype(np.float64): 1e-5, np.dtype(np.uint8): 0, np.dtype(np.int32): 0} assert(len(ctx_list) > 1) - exe_list = [sym.simple_bind(grad_req='write', **ctx) for ctx in ctx_list] + exe_list = [sym.simple_bind(grad_req=grad_req, **ctx) for ctx in ctx_list] for exe in exe_list: assert(len(exe.outputs) == 1) assert(len(exe.arg_arrays) == len(exe_list[0].arg_arrays)) assert(len(exe.grad_arrays) == len(exe_list[0].grad_arrays)) init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe_list[0].arg_arrays] + if sym.name == 'embedding': + init[0] = np.random.randint(low=0, high=10, size=exe_list[0].arg_arrays[0].shape) + for exe in exe_list: for arr, iarr in zip(exe.arg_arrays, init): arr[:] = iarr.astype(arr.dtype) @@ -32,7 +35,8 @@ def check_consistency(sym, ctx_list, scale=1.0): exe.backward(exe.outputs[0]) outputs = [exe.outputs[0].asnumpy() for exe in exe_list] - grads = [[grad.asnumpy() for grad in exe.grad_arrays] for exe in exe_list] + # lazy solution handling None grad + grads = [[grad.asnumpy() if grad is not None else np.zeros(1) for grad in exe.grad_arrays] for exe in exe_list] dtypes = [arr.dtype for arr in outputs] max_idx = np.argmax(dtypes) @@ -46,8 +50,26 @@ def check_consistency(sym, ctx_list, scale=1.0): except Exception, e: print e -def check_speed(sym, ctx, scale=1.0, N=100): - exe = sym.simple_bind(grad_req='write', **ctx) + #forward predict + for exe in exe_list: + exe.forward(is_train=False) + + outputs = [exe.outputs[0].asnumpy() for exe in exe_list] + dtypes = [arr.dtype for arr in outputs] + max_idx = np.argmax(dtypes) + + for i, exe in enumerate(exe_list): + if i == max_idx: + continue + for arr1, arr2 in zip([outputs[i]], [outputs[max_idx]]): + arr2 = arr2.astype(dtypes[i]) + try: + assert_allclose(arr1, arr2, rtol=tol[dtypes[i]], atol=tol[dtypes[i]]) + except Exception, e: + print e + +def check_speed(sym, ctx, scale=1.0, N=100, grad_req='write'): 
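    # Usage sketch for check_speed (names and shapes made up): returns the
    # average seconds per forward+backward pass over N iterations, e.g.
    #   conv = mx.sym.Convolution(num_filter=32, kernel=(3, 3), name='conv')
    #   t = check_speed(conv, {'ctx': mx.gpu(0), 'conv_data': (32, 16, 64, 64)}, N=50)
    #   print('%.2f ms per iteration' % (t * 1000))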
+ exe = sym.simple_bind(grad_req=grad_req, **ctx) init = [np.random.normal(size=arr.shape, scale=scale) for arr in exe.arg_arrays] for arr, iarr in zip(exe.arg_arrays, init): arr[:] = iarr.astype(arr.dtype) @@ -64,8 +86,14 @@ def check_speed(sym, ctx, scale=1.0, N=100): exe.outputs[0].wait_to_read() return (time.time() - tic)*1.0/N +def test_batchnorm_with_type(): + sym = mx.sym.BatchNorm(name='norm', fix_gamma=False) + ctx_list = [{'ctx': mx.gpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}, + {'ctx': mx.cpu(0), 'norm_data': (10, 2, 10, 10), 'type_dict': {'norm_data': np.float32}}] + check_consistency(sym, ctx_list) - + sym = mx.sym.BatchNorm(name='norm', fix_gamma=True) + check_consistency(sym, ctx_list) def test_convolution_with_type(): sym = mx.sym.Convolution(num_filter=3, kernel=(3,3), name='conv') @@ -77,13 +105,78 @@ def test_convolution_with_type(): check_consistency(sym, ctx_list) def test_deconvolution_with_type(): - sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='conv') - ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}}, - {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}}, - {'ctx': mx.gpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float16}}, - {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float64}}, - {'ctx': mx.cpu(0), 'conv_data': (2, 2, 10, 10), 'type_dict': {'conv_data': np.float32}}] - check_type_consistency(sym, ctx_list) + sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='deconv') + ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}}, + {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float32}}, + {'ctx': mx.gpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float16}}, + {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float64}}, + {'ctx': mx.cpu(0), 'deconv_data': (2, 2, 10, 10), 'type_dict': {'deconv_data': np.float32}}] + check_consistency(sym, ctx_list) + +def test_upsampling_with_type(): + sym = mx.sym.UpSampling(scale=2, num_filter=2, name='up', sample_type = 'nearest', num_args=1) + ctx_list = [{'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}}, + {'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float32}}, + {'ctx': mx.gpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float16}}, + {'ctx': mx.cpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float64}}, + {'ctx': mx.cpu(0), 'up_arg0': (2, 2, 2, 10), 'type_dict': {'up_arg0': np.float32}}] + check_consistency(sym, ctx_list) + +def test_concat_with_type(): + sym = mx.sym.Concat(name='concat', num_args=2) + ctx_list = [{'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10), + 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}}, + {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10), + 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}, + {'ctx': mx.gpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10), + 'type_dict': {'concat_arg0': np.float16, 'concat_arg1': np.float16}}, + {'ctx': mx.cpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10), + 'type_dict': {'concat_arg0': np.float64, 'concat_arg1': np.float64}}, + {'ctx': mx.cpu(0), 'concat_arg1': (2, 10), 'concat_arg0': (2, 10), + 'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}] + 
check_consistency(sym, ctx_list) + +def test_elementwisesum_with_type(): + sym = mx.sym.ElementWiseSum(name='ews', num_args=2) + ctx_list = [{'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10), + 'type_dict': {'ews_arg0': np.float64, 'ews_arg1': np.float64}}, + {'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10), + 'type_dict': {'ews_arg0': np.float32, 'ews_arg1': np.float32}}, + {'ctx': mx.gpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10), + 'type_dict': {'ews_arg0': np.float16, 'ews_arg1': np.float16}}, + {'ctx': mx.cpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10), + 'type_dict': {'ews_arg0': np.float64, 'ews_arg1': np.float64}}, + {'ctx': mx.cpu(0), 'ews_arg1': (2, 10), 'ews_arg0': (2, 10), + 'type_dict': {'ews_arg0': np.float32, 'ews_arg1': np.float32}}] + check_consistency(sym, ctx_list) + + +def test_reshape_with_type(): + sym = mx.sym.Reshape(name='reshape', shape=(-1,1,1,0)) + ctx_list = [{'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float64}}, + {'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float32}}, + {'ctx': mx.gpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float16}}, + {'ctx': mx.cpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float64}}, + {'ctx': mx.cpu(0), 'reshape_data': (2, 2, 2, 10), 'type_dict': {'reshape_data': np.float32}}] + check_consistency(sym, ctx_list) + +def test_blockgrad_with_type(): + sym = mx.sym.BlockGrad(name='bg') + ctx_list = [{'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float64}}, + {'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float32}}, + {'ctx': mx.gpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float16}}, + {'ctx': mx.cpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float64}}, + {'ctx': mx.cpu(0), 'bg_data': (2, 2, 2, 10), 'type_dict': {'bg_data': np.float32}}] + check_consistency(sym, ctx_list) + +def test_swapaxis_with_type(): + sym = mx.sym.SwapAxis(name='swap', dim1=1) + ctx_list = [{'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float64}}, + {'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float32}}, + {'ctx': mx.gpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float16}}, + {'ctx': mx.cpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float64}}, + {'ctx': mx.cpu(0), 'swap_data': (2, 2, 2, 10), 'type_dict': {'swap_data': np.float32}}] + check_consistency(sym, ctx_list) def test_fullyconnected_with_type(): sym = mx.sym.FullyConnected(num_hidden=3, name='inner') @@ -104,10 +197,29 @@ def test_activation_with_type(): {'ctx': mx.cpu(0), 'act_data': (2, 2, 10, 10), 'type_dict': {'act_data': np.float16}}] check_consistency(sym, ctx_list) +def test_embedding_with_type(): + sym = mx.sym.Embedding(name='embedding', input_dim=10, output_dim=20) + ctx_list = [{'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float64}}, + {'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float32}}, + {'ctx': mx.gpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float16}}, + {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float64}}, + {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float32}}, + {'ctx': mx.cpu(0), 'embedding_data': (2, 10), 'type_dict': {'embedding_data': np.float16}}] + check_consistency(sym, ctx_list, 
grad_req={'embedding_data': 'null','embedding_weight': 'write'}) + if __name__ == '__main__': + test_batchnorm_with_type() test_convolution_with_type() test_deconvolution_with_type() + test_upsampling_with_type() + test_concat_with_type() + test_elementwisesum_with_type() + test_reshape_with_type() + test_blockgrad_with_type() + test_swapaxis_with_type() test_fullyconnected_with_type() test_activation_with_type() - #test_softmax_with_shape((3,4), mx.gpu()) - #test_multi_softmax_with_shape((3,4,5), mx.gpu()) \ No newline at end of file + test_embedding_with_type() + #test_softmax_with_shape((3,4), mx.gpu()) + #test_multi_softmax_with_shape((3,4,5), mx.gpu()) + diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py index 8a8049590b34..45537ff7540b 100644 --- a/tests/python/unittest/test_ndarray.py +++ b/tests/python/unittest/test_ndarray.py @@ -34,7 +34,7 @@ def check_with_uniform(uf, arg_shapes, dim=None, npuf=None, rmin=-10, type_list= out2 = uf(*numpy_arg).astype(dtype) else: out2 = npuf(*numpy_arg).astype(dtype) - + assert out1.shape == out2.shape if isinstance(out1, mx.nd.NDArray): out1 = out1.asnumpy() @@ -205,7 +205,7 @@ def test_dot(): assert reldiff(c, C.asnumpy()) < 1e-5 def test_reduce(): - sample_num = 1000 + sample_num = 200 def test_reduce_inner(numpy_reduce_func, nd_reduce_func): for i in range(sample_num): ndim = np.random.randint(1, 8) @@ -223,10 +223,12 @@ def test_reduce_inner(numpy_reduce_func, nd_reduce_func): axes = tuple(axes) numpy_ret = numpy_reduce_func(dat, axis=axes, keepdims=keepdims) - ndarray_ret = nd_reduce_func(arr=mx.nd.array(dat), axis=axes, keepdims=keepdims) + ndarray_ret = nd_reduce_func(mx.nd.array(dat), axis=axes, keepdims=keepdims) if type(ndarray_ret) is mx.ndarray.NDArray: ndarray_ret = ndarray_ret.asnumpy() - assert ndarray_ret.shape == numpy_ret.shape + assert (ndarray_ret.shape == numpy_ret.shape) or \ + (ndarray_ret.shape == (1,) and numpy_ret.shape == ()), "nd:%s, numpy:%s" \ + %(ndarray_ret.shape, numpy_ret.shape) err = np.square(ndarray_ret - numpy_ret).mean() assert err < 1E-4 test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum), diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 4638f9c905d1..a9fc189cc727 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -10,6 +10,13 @@ def same(a, b): return np.sum(a != b) == 0 +def np_softmax(x): + x = x - np.max(x, axis=1).reshape(x.shape[0], 1) + x = np.exp(x) + x /= np.sum(x, axis=1).reshape(x.shape[0], 1) + return x + + def check_elementwise_sum_with_shape(shape, n): # forward inputs = [mx.symbol.Variable('arg%d' % i) for i in range(n)] @@ -235,20 +242,23 @@ def check_softmax_with_ignore_label(xpu): assert(reldiff(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):]) < 1e-5) def check_softmax_with_shape(shape, xpu): + # bind with label X = mx.symbol.Variable('X') L = mx.symbol.Variable('L') Y = mx.symbol.SoftmaxOutput(data=X, label=L) x = mx.random.uniform(-1, 1, shape, ctx = xpu) - l = mx.nd.empty((shape[0],), ctx = xpu) - l[:] = np.random.randint(0, shape[1]-1, (shape[0],)) + l = mx.random.uniform(-1, 1, shape, ctx = xpu) + l[:] = np_softmax(l.asnumpy()) grad = mx.nd.empty(shape, ctx = xpu) - exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad}) - print('foward') exec1.forward() - print(exec1.outputs[0].asnumpy()) + out = exec1.outputs[0].asnumpy() + assert_allclose(out, np_softmax(x.asnumpy())) exec1.backward() - 
print(grad.asnumpy()) + assert_allclose(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy()) + +def test_softmax(): + check_softmax_with_shape((3, 4), mx.cpu()) def check_multi_softmax_with_shape(shape, xpu): X = mx.symbol.Variable('X') @@ -625,7 +635,31 @@ def check_deconvolution_gradient(input_shape, num_filter, pad): exe_deconv.backward(deconv_out_grad) assert reldiff(conv_args_grad[1].asnumpy(), deconv_args_grad[1].asnumpy()) < 1e-6 +def check_deconvolution_target_shape(input_shape, kernel, stride, pad, adj, target_shape=None): + data = mx.sym.Variable(name="data") + deconv = mx.sym.Deconvolution( + data=data, kernel=kernel, stride=stride, pad=pad, adj=adj, num_filter=5, + target_shape = target_shape if target_shape is not None else (0, 0)) + arg_names = deconv.list_arguments() + arg_shapes, out_shapes, _ = deconv.infer_shape(data=input_shape) + assert out_shapes[0] == (input_shape[0], 5, 8, 8) + def test_deconvolution(): + check_deconvolution_target_shape( + input_shape = (2,3,4,4), + kernel = (3,3), + stride = (2,2), + target_shape = (8,8), + pad = (99,99), # will be ignored + adj = (101,101), # will be ignored + ) + check_deconvolution_target_shape( + input_shape = (2,3,4,4), + kernel = (3,3), + stride = (2,2), + pad = (1,1), + adj = (1,1), + ) check_deconvolution_forward_backward( input_shape = (1,1,5,5), num_filter = 1, @@ -696,12 +730,6 @@ def test_batchnorm_training(): check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2) - # Gamma needs to be fixed at one when fix_gamma is true, - gamma = np.ones(s) - - test = mx.symbol.BatchNorm(data, fix_gamma=True) - check_numeric_gradient(test, [data_tmp, gamma, beta], [rolling_mean, rolling_std], numeric_eps=1e-3, check_eps=5e-2) - def test_convolution_grouping(): num_filter = 4 num_group = 2 @@ -733,24 +761,27 @@ def test_convolution_grouping(): np.testing.assert_allclose(arr1.asnumpy(), arr2.asnumpy(), rtol=1e-3) def _gen_broadcast_data(): - testing_shapes = [(2, 3, 4), (3, 5, 7), (4, 2, 6)] - shape_pairs = [] - for n, m, k in testing_shapes: - shape_pairs += [((1,), (1,)), - ((n,), (n,)), - ((n,m), (n,m)), - ((n,m,k), (n,m,k)), - ((n,1), (1,n)), - ((n,m,k), (n,1,1)), - ((n,m,k), (1,m,1)), - ((n,m,k), (1,m,k)), - ((n,m,k), (n,m,1)), - ((n,m,k), (1,1,k))] - shape_pairs += [(v, u) for (u, v) in shape_pairs] - return [(np.random.random(u), np.random.random(v)) for (u,v) in shape_pairs] + # Generate random data that has ndim between 1-7 and all the shape dims between 1-5 + ndim = np.random.randint(1, 8) + shape = np.random.randint(1, 6, size=(ndim,)) + l_same_dim = np.random.randint(0, 5) + r_same_dim = np.random.randint(0, 5) + l_axis_flags = np.random.randint(0, 2, size=ndim) + r_axis_flags = np.random.randint(0, 2, size=ndim) + if l_same_dim == 4: + l_axis_flags = np.ones(ndim) + if r_same_dim == 4: + r_axis_flags = np.ones(ndim) + l_shape = shape.copy() + r_shape = shape.copy() + l_shape[np.where(l_axis_flags == 0)] = 1 + r_shape[np.where(r_axis_flags == 0)] = 1 + return [np.random.random(l_shape), np.random.random(r_shape)] def _check_broadcast_op_forward(symbol, baseline): - for d in _gen_broadcast_data(): + sample_num = 200 + for i in range(sample_num): + d = _gen_broadcast_data() x = baseline(d[0], d[1]) y = symbol.bind(mx.cpu(), args={'a': mx.nd.array(d[0]), 'b' : mx.nd.array(d[1])}) y.forward() @@ -759,8 +790,10 @@ def _check_broadcast_op_forward(symbol, baseline): err, d[0].shape, d[1].shape) def _check_broadcast_op_backward(symbol, baseline): - for d in 
+        _check_broadcast_op_backward(c, lambda g_out, a, b: (g_out * a **(b - 1) * b,
+                                                             g_out * a ** b * np.log(a)))
+
     test_bplus(a, b)
     test_bminus(a, b)
     test_bmul(a, b)
     test_bdiv(a, b)
+    test_bpow(a, b)

 def test_run_convolution_dilated_impulse_response(dil=(1,1), kernel_shape=(3,3), verbose=False):
     # Input for spike response
@@ -927,12 +967,12 @@ def test_reshape_new(src_shape, shape_args, dst_shape):
     assert(output_shape[0] == (2, 75))

 def test_reduce():
-    sample_num = 1000
+    sample_num = 200
     def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
         for i in range(sample_num):
-            # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
+            # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
             ndim = np.random.randint(1, 8)
-            shape = np.random.randint(1, 11, size=(ndim,))
+            shape = np.random.randint(1, 6, size=(ndim,))
             axis_num = np.random.randint(0, ndim, size=1)
             axis_flags = np.random.randint(0, 2, size=ndim)
             axes = []
@@ -941,11 +981,16 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
                     axes.append(axis)
             if 0 == len(axes):
                 axes = None
+            elif 1 == len(axes):
+                axes = axes[0]
             else:
                 axes = tuple(axes)
             keepdims = np.random.randint(0, 2)
             a = mx.symbol.Variable('a')
-            b = mx_reduce_sym(a, axis=axes, keepdims=keepdims)
+            if axes is None:
+                b = mx_reduce_sym(a, keepdims=keepdims)
+            else:
+                b = mx_reduce_sym(a, axis=axes, keepdims=keepdims)
             dat_npy = np.random.rand(*shape)
             sum_groundtruth = np.array(numpy_reduce_func(dat_npy, axis=axes, keepdims=keepdims))
             if sum_groundtruth.shape == ():
@@ -958,45 +1003,48 @@ def test_reduce_inner(numpy_reduce_func, numpy_reduce_grad_func, mx_reduce_sym):
                            args_grad={'a': grad_nd})
             net.forward(is_train=True)

-            err_forward = np.square(net.outputs[0].asnumpy() - sum_groundtruth).sum()/np.prod(shape)
-            assert err_forward < 1E-6
+            err_forward = reldiff(net.outputs[0].asnumpy(), sum_groundtruth)
+            assert err_forward < 1E-4
             net.backward(out_grads=mx.nd.array(outgrad_npy))
-            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).sum()
-            assert err_backward < 1E-6
+            err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
+            assert err_backward < 1E-4
     test_reduce_inner(lambda data, axis, keepdims:_np_reduce(data, axis, keepdims, np.sum),
                       lambda outgrad, data, axis, keepdims: outgrad.reshape(_np_reduce(data, axis, 1, np.sum).shape),
                       mx.symbol.sum)

 def test_broadcast():
-    sample_num = 1000
-    def test_broadcast_axis():
-        for i in range(sample_num):
-            # Generate random data that has ndim between 1-7 and all the shape dims between 1-10
-            ndim = np.random.randint(1, 8)
-            target_shape = np.random.randint(1, 11, size=(ndim,))
-            axis = np.random.randint(0, ndim)
-            shape = target_shape.copy()
-            size = shape[axis]
-            shape[axis] = 1
-            a = mx.symbol.Variable('a')
-            b = mx.symbol.broadcast_axis(a, axis=axis, size=size)
+    sample_num = 200
+    for i in range(sample_num):
+        # Generate random data that has ndim between 1-7 and all the shape dims between 1-5
+        ndim = np.random.randint(1, 8)
+        target_shape = np.random.randint(1, 6, size=(ndim,))
+        axis = tuple(set(np.random.randint(0, ndim, np.random.randint(1, ndim + 1))))
+        shape = target_shape.copy()
+        size = tuple([shape[ele] for ele in axis])
+        for ele in axis:
+            shape[ele] = 1
+        a = mx.symbol.Variable('a')
+        sym_bcast_axis = mx.symbol.broadcast_axis(a, axis=axis, size=size)
+        sym_bcast_to = mx.symbol.broadcast_to(a, shape=tuple(target_shape))
+        def test_broadcasting_ele(sym_bcast):
             dat_npy = np.random.rand(*shape)
             groundtruth = dat_npy
             grad_nd = mx.nd.empty(shape)
             outgrad_npy = np.random.rand(*target_shape)
             grad_groundtruth = _np_reduce(outgrad_npy, axis=axis, keepdims=True,
                                           numpy_reduce_func=np.sum)
-            net = b.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
-                         args_grad={'a': grad_nd})
+            net = sym_bcast.bind(mx.cpu(), args={'a': mx.nd.array(dat_npy)},
+                                 args_grad={'a': grad_nd})
             net.forward(is_train=True)
             assert (net.outputs[0].shape == target_shape).all()
-            err_forward = np.square(net.outputs[0].asnumpy() - groundtruth).mean()
-            assert err_forward < 1E-8
+            err_forward = reldiff(net.outputs[0].asnumpy(), groundtruth)
+            assert err_forward < 1E-4
             net.backward(out_grads=mx.nd.array(outgrad_npy))
-            err_backward = np.square(grad_nd.asnumpy() - grad_groundtruth).mean()
-            assert err_backward < 1E-8
-    test_broadcast_axis()
+            err_backward = reldiff(grad_nd.asnumpy(), grad_groundtruth)
+            assert err_backward < 1E-4
+        test_broadcasting_ele(sym_bcast_axis)
+        test_broadcasting_ele(sym_bcast_to)

 def test_transpose():
     for ndim in range(1, 6):
@@ -1012,6 +1060,17 @@ def test_transpose():
             y = mx.nd.transpose(x)
             assert_allclose(np.transpose(x.asnumpy()), y.asnumpy())

+
+def test_expand_dims():
+    for ndim in range(1, 6):
+        for t in range(5):
+            dims = list(np.random.randint(1, 10, size=ndim))
+            axis = np.random.randint(1, ndim+1)
+            x = mx.nd.array(np.random.normal(size=dims))
+            y = mx.nd.expand_dims(x, axis=axis)
+            assert_allclose(np.expand_dims(x.asnumpy(), axis=axis), y.asnumpy())
+
+
 def test_crop():
     for ndim in range(1, 6):
         for t in range(5):
@@ -1031,6 +1090,35 @@ def test_crop():
             y = mx.nd.crop(x, begin=tuple(begin), end=tuple(end))
             assert_allclose(x.asnumpy()[idx], y.asnumpy())

+
+def test_slice_axis():
+    for ndim in range(1, 6):
+        shape = np.random.randint(1, 11, size=(ndim,))
+        for t in range(ndim):
+            d = shape[t]
+            b = random.randint(0, d-1)
+            e = random.randint(b+1, d)
+            idx = []
+            for i in range(ndim):
+                idx.append(slice(0, shape[i]))
+            idx[t] = slice(b, e)
+
+            X = mx.symbol.Variable('X')
+            x = mx.nd.array(np.random.normal(size=shape))
+            Y = mx.symbol.slice_axis(data=X, axis=t, begin=b, end=e)
+
+            xgrad = mx.nd.empty(x.shape)
+            exec1 = Y.bind(mx.cpu(), args = [x], args_grad = {'X': xgrad})
+            exec1.forward()
+            y = exec1.outputs[0]
+            assert_allclose(x.asnumpy()[idx], y.asnumpy())
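+            # the gradient of slice_axis scatters the output gradient back
+            # into the sliced region; elements outside [b, e) stay zero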
+            exec1.backward([y])
+            xx = x.asnumpy()
+            xx[:] = 0.0
+            xx[idx] = x.asnumpy()[idx]
+            assert_allclose(xx, xgrad.asnumpy())
+
+
 def test_flip():
     for ndim in range(1, 6):
         for t in range(5):
@@ -1041,7 +1129,319 @@ def test_flip():
             y = mx.nd.flip(x, axis=axis)
             assert_allclose(x.asnumpy()[idx], y.asnumpy())

+
+def test_stn():
+    num_filter = 2  # conv of loc net
+    kernel = (3, 3)  # conv of loc net
+    num_hidden = 6  # fc of loc net
+    for n in [1, 2, 3, 4]:
+        for c in [1, 2, 3, 4]:
+            for h in [5, 9, 13, 17]:  # for convenience of testing, the third and fourth input dims should be 4x + 1
+                for w in [5, 9, 13, 17]:
+                    data_shape = (n, c, h, w)
+                    target_shape = (int((data_shape[2]+1)/2), int((data_shape[3]+1)/2))
+                    data = mx.sym.Variable(name="data")
+                    loc = mx.sym.Convolution(data=data, kernel=kernel, pad=(1, 1), num_filter=num_filter, name="loc_conv")
+                    loc = mx.sym.Flatten(data=loc)
+                    loc = mx.sym.FullyConnected(data=loc, num_hidden=num_hidden, name="loc_fc")
+                    stn = mx.sym.SpatialTransformer(data=data, loc=loc, target_shape=target_shape,
+                                                    transform_type="affine", sampler_type="bilinear")
+                    arg_names = stn.list_arguments()
+                    arg_shapes, out_shapes, _ = stn.infer_shape(data=data_shape)
+                    # check shape
+                    assert out_shapes[0] == (data_shape[0], data_shape[1], target_shape[0], target_shape[1])
+                    dev = mx.cpu()
+                    #dev = mx.gpu(0)
+                    args = {}
+                    args['data'] = mx.random.normal(0, 1, data_shape, dev)
+                    args['loc_conv_weight'] = mx.nd.zeros((num_filter, data_shape[1], kernel[0], kernel[1]), ctx=dev)
+                    args['loc_conv_bias'] = mx.nd.zeros((num_filter,), ctx=dev)
+                    args['loc_fc_weight'] = mx.nd.zeros((6, num_filter*data_shape[2]*data_shape[3]), ctx=dev)
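+                    # loc_fc is initialized so the predicted affine transform is a
+                    # pure 0.5x scaling; the sampled output then equals a center
+                    # crop of the input, which the checks below rely on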
+                    args['loc_fc_bias'] = mx.nd.array([0.5, 0, 0, 0, 0.5, 0], ctx=dev)
+                    grad_grad = [mx.nd.zeros(shape, ctx=dev) for shape in arg_shapes]
+                    exe = stn.bind(dev, args=args, args_grad=grad_grad)
+                    exe.forward(is_train=True)
+                    out = exe.outputs[0].asnumpy()
+                    # check forward
+                    assert reldiff(out, args['data'].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-6
+                    out_grad = mx.nd.ones(out.shape, ctx=dev)
+                    exe.backward([out_grad])
+                    # check backward
+                    assert reldiff(out_grad.asnumpy(), grad_grad[0].asnumpy()[:, :, h//4:h-h//4, w//4:w-w//4]) < 1e-6
+
+
+def test_dot(ctx=mx.cpu()):
+    for m in range(1, 5):
+        for k in range(1, 5):
+            for n in range(1, 5):
+                a_npy = np.random.normal(0, 1, (m, k))
+                b_npy = np.random.normal(0, 1, (k, n))
+                c_npy = np.empty((m, n))
+                ograd_npy = np.random.normal(0, 1, (m, n))
+                agrad_npy = np.empty((m, k))
+                bgrad_npy = np.empty((k, n))
+                c_npy[:, :] = np.dot(a_npy[:, :], b_npy[:, :])
+                bgrad_npy[:, :] = np.dot(a_npy[:, :].T, ograd_npy[:, :])
+                agrad_npy[:, :] = np.dot(ograd_npy[:, :], b_npy[:, :].T)
+                a = mx.sym.Variable('a')
+                b = mx.sym.Variable('b')
+                c = mx.sym.dot(a, b)
+                exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape)
+                outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
+                assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
+                exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
+
+
+def test_batch_dot(ctx=mx.cpu()):
+    for batch_size in range(1, 5):
+        for m in range(1, 5):
+            for k in range(1, 5):
+                for n in range(1, 5):
+                    a_npy = np.random.normal(0, 1, (batch_size, m, k))
+                    b_npy = np.random.normal(0, 1, (batch_size, k, n))
+                    c_npy = np.empty((batch_size, m, n))
+                    ograd_npy = np.random.normal(0, 1, (batch_size, m, n))
+                    agrad_npy = np.empty((batch_size, m, k))
+                    bgrad_npy = np.empty((batch_size, k, n))
+                    for i in range(batch_size):
+                        c_npy[i, :, :] = np.dot(a_npy[i, :, :], b_npy[i, :, :])
+                        bgrad_npy[i, :, :] = np.dot(a_npy[i, :, :].T, ograd_npy[i, :, :])
+                        agrad_npy[i, :, :] = np.dot(ograd_npy[i, :, :], b_npy[i, :, :].T)
+                    a = mx.sym.Variable('a')
+                    b = mx.sym.Variable('b')
+                    c = mx.sym.batch_dot(a, b)
+                    exe = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='write')
+                    exe_add = c.simple_bind(ctx=ctx, a=a_npy.shape, b=b_npy.shape, grad_req='add')
+                    a_init_grad_npy = np.random.normal(size=(batch_size, m, k))
+                    b_init_grad_npy = np.random.normal(size=(batch_size, k, n))
+                    exe_add.grad_dict['a'][:] = a_init_grad_npy
+                    exe_add.grad_dict['b'][:] = b_init_grad_npy
+                    outputs = exe.forward(is_train=True, a=a_npy, b=b_npy)
+                    assert reldiff(outputs[0].asnumpy(), c_npy) < 1E-3
+                    exe.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                    assert reldiff(exe.grad_dict['a'].asnumpy(), agrad_npy) < 1E-3
+                    assert reldiff(exe.grad_dict['b'].asnumpy(), bgrad_npy) < 1E-3
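+                    # exe_add was bound with grad_req='add' and seeded with
+                    # random gradients, so backward must accumulate the analytic
+                    # gradient on top of those initial values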
+                    exe_add.forward(is_train=True, a=a_npy, b=b_npy)
+                    exe_add.backward(out_grads=[mx.nd.array(ograd_npy, ctx=exe._ctx)])
+                    assert reldiff(exe_add.grad_dict['a'].asnumpy(),
+                                   agrad_npy + a_init_grad_npy) < 1E-3
+                    assert reldiff(exe_add.grad_dict['b'].asnumpy(),
+                                   bgrad_npy + b_init_grad_npy) < 1E-3
+
+def get_correlation(data1,data2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply):
+
+    img1 = mx.sym.Variable('img1')
+    img2 = mx.sym.Variable('img2')
+    return mx.sym.Correlation(data1=img1,data2=img2,kernel_size =kernel_size,max_displacement = max_displacement,
+                              stride1 = stride1,stride2 = stride2,pad_size= pad_size,is_multiply = is_multiply)
+
+def correlation_forward(data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply):
+
+    # compute output's dimension
+    paddedbottomheight = data1.shape[2] + 2 * pad_size
+    paddedbottomwidth = data1.shape[3] + 2 * pad_size
+    kernel_radius = (kernel_size - 1) // 2
+    border_size = max_displacement + kernel_radius
+    top_width = (paddedbottomwidth - border_size * 2) // stride1
+    top_height = (paddedbottomheight - border_size * 2) // stride1
+    neighborhood_grid_radius = max_displacement // stride2
+    neighborhood_grid_width = neighborhood_grid_radius * 2 + 1
+    top_channels = neighborhood_grid_width * neighborhood_grid_width
+
+    out = np.zeros((data1.shape[0], top_channels, top_height, top_width))
+    tmp1 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth))
+    tmp2 = np.zeros((data1.shape[0],data1.shape[1],paddedbottomheight, paddedbottomwidth))
+
+    tmp1[:, :, pad_size:pad_size + data1.shape[2], pad_size:pad_size + data1.shape[3]] = data1[:,:,:,:]
+    tmp2[:, :, pad_size:pad_size + data2.shape[2], pad_size:pad_size + data2.shape[3]] = data2[:,:,:,:]
+
+    for i in range(top_height):
+        for j in range(top_width):
+            for nbatch in range(data1.shape[0]):
+
+                # x1,y1 is the location in data1 , i,j is the location in output
+                x1 = j * stride1 + max_displacement
+                y1 = i * stride1 + max_displacement
+
+                for top_channel in range(top_channels):
+
+                    s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2
+
+                    # location in data2
+                    x2 = x1 + s2o
+                    y2 = y1 + s2p
+
+                    for h in range(kernel_size):
+                        for w in range(kernel_size):
+                            for channel in range(data1.shape[1]):
+                                if is_multiply:
+                                    out[nbatch, top_channel, i, j] += tmp1[nbatch, channel,y1 + h, x1 + w] * tmp2[nbatch, channel, y2 + h,x2 + w]
+                                else:
+                                    out[nbatch, top_channel, i, j] += abs(tmp1[nbatch, channel, y1 + h, x1 + w] - tmp2[nbatch, channel, y2 + h, x2 + w])
+    out /= float(kernel_size**2*data1.shape[1])
+    return out,tmp1,tmp2
+
+def correlation_backward(out_grad,tmp1,tmp2,data1,data2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply):
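+    # backward of Correlation: route each output gradient back to both padded
+    # inputs (product rule in the multiply mode, sign of the difference
+    # otherwise), then strip the padding before returning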
+
+    # compute output's dimension
+    paddedbottomheight = data1.shape[2] + 2 * pad_size
+    paddedbottomwidth = data1.shape[3] + 2 * pad_size
+    kernel_radius = (kernel_size - 1) // 2
+    border_size = max_displacement + kernel_radius
+    top_width = (paddedbottomwidth - border_size * 2) // stride1
+    top_height = (paddedbottomheight - border_size * 2) // stride1
+    neighborhood_grid_radius = max_displacement // stride2
+    neighborhood_grid_width = neighborhood_grid_radius * 2 + 1
+    top_channels = neighborhood_grid_width * neighborhood_grid_width
+
+    out = np.zeros((data1.shape[0], top_channels, top_height, top_width))
+    tmp1_grad = np.zeros(tmp1.shape)
+    tmp2_grad = np.zeros(tmp2.shape)
+
+    for i in range(top_height):
+        for j in range(top_width):
+            for nbatch in range(data1.shape[0]):
+
+                # x1,y1 is the location in data1 , i,j is the location in output
+                x1 = j * stride1 + max_displacement
+                y1 = i * stride1 + max_displacement
+
+                for top_channel in range(top_channels):
+
+                    s2o = (top_channel % neighborhood_grid_width - neighborhood_grid_radius) * stride2
+                    s2p = (top_channel // neighborhood_grid_width - neighborhood_grid_radius) * stride2
+
+                    # location in data2
+                    x2 = x1 + s2o
+                    y2 = y1 + s2p
+
+                    for h in range(kernel_size):
+                        for w in range(kernel_size):
+                            for channel in range(data1.shape[1]):
+                                if is_multiply:
+                                    tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*tmp2[nbatch, channel, y2 + h,x2 + w]
+                                    tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*tmp1[nbatch, channel, y1 + h,x1 + w]
+                                else:
+                                    sgn = 1 if (tmp1[nbatch, channel, y1 + h,x1 + w]>=tmp2[nbatch, channel, y2 + h,x2 + w]) else -1
+                                    tmp1_grad[nbatch,channel,y1+h,x1+w]+= out_grad[nbatch,top_channel,i,j]*sgn
+                                    tmp2_grad[nbatch,channel,y2+h,x2+w]+= out_grad[nbatch,top_channel,i,j]*(-sgn)
+
+    tmp1_grad = tmp1_grad / float(kernel_size**2*data1.shape[1])
+    tmp2_grad = tmp2_grad / float(kernel_size**2*data1.shape[1])
+    return tmp1_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],tmp2_grad[:,:,pad_size:pad_size+data1.shape[2],pad_size:pad_size+data1.shape[3]],
+
+def unittest_correlation(data_shape,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply):
+
+    img1 = np.random.random(data_shape)
+    img2 = np.random.random(data_shape)
+
+    net1 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply)
+    net2 = get_correlation(img1,img2,kernel_size,max_displacement,stride1,stride2,pad_size,is_multiply )
+
+    exe1 = net1.simple_bind(mx.cpu(),img1=img1.shape,img2=img1.shape)
+    exe1.arg_dict['img1'][:] = img1
+    exe1.arg_dict['img2'][:] = img2
+
+    #cpu forward
+    exe1.forward()
+    # python forward
+    forward_result,tmp1,tmp2 = correlation_forward(img1,img2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply)
+
+    # forward error
+    assert np.abs(exe1.outputs[0].asnumpy()-forward_result).mean()<1e-4
+
+    # out_grad
+    a = np.ones(forward_result.shape)
+    out_grad1 = mx.nd.array(a,mx.cpu())
+    # cpu backward
+    exe1.backward(out_grads=out_grad1)
+    # python backward
+    grad1,grad2 = correlation_backward(a,tmp1,tmp2,img1,img2,pad_size,kernel_size,stride1,stride2,max_displacement,is_multiply)
+
+    # backward error
+    assert np.abs(exe1.grad_dict['img1'].asnumpy() - grad1).mean() < 1e-4
+    assert np.abs(exe1.grad_dict['img2'].asnumpy() - grad2).mean() < 1e-4
+
+def test_correlation():
+
+    unittest_correlation((1,3,10,10), kernel_size = 1,max_displacement = 4,stride1 = 1,stride2 = 1,pad_size = 4,is_multiply = False)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = False)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 5,stride1 = 1,stride2 = 1,pad_size = 5,is_multiply = True)
+    unittest_correlation((5,1,15,15), kernel_size = 1,max_displacement = 10,stride1 = 1,stride2 = 2,pad_size = 10,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = True)
+    unittest_correlation((5,1,4,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
+    unittest_correlation((5,1,6,4), kernel_size = 3,max_displacement = 1,stride1 = 2,stride2 = 1,pad_size = 2,is_multiply = False)
+    unittest_correlation((5,1,11,11), kernel_size = 5,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = False)
+
+
+def test_support_vector_machine_l1_svm():
+    xpu = mx.cpu()
+    shape = (20, 10)
+
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.SVMOutput(data=X, label=L, use_linear=True)
+    x = mx.nd.empty(shape, ctx = xpu)
+    l = mx.nd.empty((shape[0],), ctx = xpu)
+    x_np = np.random.rand(*shape)
+    l_np = np.random.randint(0, shape[1], (shape[0],))
+    x[:] = x_np
+    l[:] = l_np
+
+    grad = mx.nd.empty(shape, ctx = xpu)
+    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
+    exec1.forward()
+
+    assert_allclose(x_np, exec1.outputs[0].asnumpy())
+
+    exec1.backward()
+
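+    # l_mask turns integer labels into a {-1, +1} matrix per class; the L1-SVM
+    # (hinge loss) gradient is -y wherever the margin 1 - y*x is violated, 0 elsewhere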
+    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
+    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
+    grad_np = (-1) * l_mask * np.greater(1 - l_mask * x_np, 0)
+
+    assert_allclose(grad_np, grad.asnumpy())
+
+def test_support_vector_machine_l2_svm():
+    xpu = mx.cpu()
+    shape = (20, 10)
+
+    X = mx.symbol.Variable('X')
+    L = mx.symbol.Variable('L')
+    Y = mx.symbol.SVMOutput(data=X, label=L)
+    x = mx.nd.empty(shape, ctx = xpu)
+    l = mx.nd.empty((shape[0],), ctx = xpu)
+    x_np = np.random.rand(*shape)
+    x_np = x_np.astype(np.float32)
+    l_np = np.random.randint(0, shape[1], (shape[0],))
+    x[:] = x_np
+    l[:] = l_np
+
+    grad = mx.nd.empty(shape, ctx = xpu)
+    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
+    exec1.forward()
+
+    assert_allclose(x_np, exec1.outputs[0].asnumpy())
+
+    exec1.backward()
+
+    # same masking as the L1 case; the L2-SVM gradient is -2y * max(1 - y*x, 0)
+    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
+    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
+    grad_np = (-2)*l_mask*np.maximum(1-l_mask*x_np,0)
+    grad_np = grad_np.astype(np.float32)
+    assert_allclose(grad_np, grad.asnumpy())
+
+
 if __name__ == '__main__':
+    test_expand_dims()
+    test_slice_axis()
+    test_softmax()
     test_broadcast_binary_op()
     test_flip()
     test_crop()
@@ -1072,5 +1472,9 @@ def test_flip():
     test_reshape()
     test_reduce()
     test_broadcast()
-    #check_softmax_with_shape((3,4), mx.cpu())
-    #check_multi_softmax_with_shape((3,4,5), mx.cpu())
+    test_stn()
+    test_dot()
+    test_batch_dot()
+    test_correlation()
+    test_support_vector_machine_l1_svm()
+    test_support_vector_machine_l2_svm()
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index 10be569e8f76..3ec5029cc0d3 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -23,10 +23,42 @@ def check_with_device(device):
     assert abs(np.mean(un1.asnumpy()) - (a+b)/2) < 0.1

+
+def check_symbolic_random(dev):
+    a, b = -10, 10
+    mu, sigma = 10, 2
+    shape = (100, 100)
+    X = mx.sym.Variable("X")
+    Y = mx.sym.uniform(low=a, high=b, shape=shape) + X
+    x = mx.nd.zeros(shape, ctx=dev)
+    xgrad = mx.nd.zeros(shape, ctx=dev)
+    yexec = Y.bind(dev, {'X' : x}, {'X': xgrad})
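+    # re-seeding with the same seed must reproduce the symbolic samples; the
+    # uniform node itself has no gradient, so backward passes the output
+    # gradient through to X unchanged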
+    mx.random.seed(128)
+    yexec.forward()
+    yexec.backward(yexec.outputs[0])
+    un1 = (yexec.outputs[0] - x).copyto(dev)
+    assert same(xgrad.asnumpy(), un1.asnumpy())
+    mx.random.seed(128)
+    yexec.forward()
+    un2 = (yexec.outputs[0] - x).copyto(dev)
+    assert same(un1.asnumpy(), un2.asnumpy())
+    assert abs(np.mean(un1.asnumpy()) - (a+b)/2) < 0.1
+
+    Y = mx.sym.normal(loc=mu, scale=sigma, shape=shape)
+    yexec = Y.simple_bind(dev)
+    mx.random.seed(128)
+    yexec.forward()
+    ret1 = yexec.outputs[0].copyto(dev)
+    mx.random.seed(128)
+    ret2 = mx.random.normal(mu, sigma, shape)
+    assert same(ret1.asnumpy(), ret2.asnumpy())
+    assert abs(np.mean(ret1.asnumpy()) - mu) < 0.1
+    assert abs(np.std(ret1.asnumpy()) - sigma) < 0.1
+
+
 def test_random():
     check_with_device(mx.cpu())
+    check_symbolic_random(mx.cpu())

 if __name__ == '__main__':
     test_random()
-
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index 33fe0072ff26..edbc2f99ebaa 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -74,12 +74,16 @@ if [ ${TASK} == "r_test" ]; then

     Rscript tests/travis/r_vignettes.R

-    wget http://webdocs.cs.ualberta.ca/~bx3/data/Inception.zip
+    wget http://data.dmlc.ml/mxnet/data/Inception.zip
     unzip Inception.zip && rm -rf Inception.zip
     wget https://s3-us-west-2.amazonaws.com/mxnet/train.csv -O train.csv
     wget https://s3-us-west-2.amazonaws.com/mxnet/test.csv -O test.csv

-    cat *.R > r_test.R
+    cat CallbackFunctionTutorial.R \
+        fiveMinutesNeuralNetwork.R \
+        mnistCompetition.R \
+        classifyRealImageWithPretrainedModel.R \
+        ndarrayAndSymbolTutorial.R > r_test.R

     Rscript r_test.R || exit -1

diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 9e7fa00b7490..8e9e581fe66f 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -8,7 +8,6 @@ fi
 if [ ${TRAVIS_OS_NAME} == "osx" ]; then
     brew update
     brew tap homebrew/science
-    brew info opencv
     brew install opencv
     brew install python3
     brew install fftw
diff --git a/tools/caffe_converter/README.md b/tools/caffe_converter/README.md
index 2e6eca1ea40c..3155239daf1d 100644
--- a/tools/caffe_converter/README.md
+++ b/tools/caffe_converter/README.md
@@ -1,6 +1,6 @@
 # Convert Caffe Model to Mxnet Format

-### Build
+### Build (Linux)
 Either [Caffe's python package](http://caffe.berkeleyvision.org/installation.html) or
 [Google protobuf](https://developers.google.com/protocol-buffers/?hl=en) is required. The
 latter is often much easier to install:
@@ -10,9 +10,25 @@ Either [Caffe's python package](http://caffe.berkeleyvision.org/installation.htm

 Now we can build the tool by running `make` in the current directory.

+### Build (Windows)
+
+Note: this tool currently only works with Python 2.
+
+We must make sure that the installed python bindings and the protobuf compiler use the
+same version of protobuf, so we install the bindings first, and then install the
+corresponding compiler.
+
+1. Install the protobuf bindings. At the time of writing, the conda package manager has
+   the most up-to-date version. Either run `conda install -c conda-forge protobuf` or `pip install protobuf`
+2. Download the win32 build of protoc from [Protocol Buffers Releases](https://github.com/google/protobuf/releases).
+   Make sure to download the version that corresponds to the version of the bindings.
+   Extract to any location, then add that location to your `PATH`
+3. Run `make_win32.bat` to build the package
+
+
 ### How to use

-Use `./run.sh model_name` to download and convert a model. E.g. `./run.sh vgg19`
+Linux: Use `./run.sh model_name` to download and convert a model. E.g. `./run.sh vgg19`
+
+Windows: Use `python convert_model.py prototxt caffemodel outputprefix`
+For example: `python convert_model.py VGG_ILSVRC_16_layers_deploy.prototxt VGG_ILSVRC_16_layers.caffemodel vgg16`
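+
+The converted model can then be loaded from python. A minimal sketch, assuming the
+converter wrote `vgg16-symbol.json` and `vgg16-0001.params` (checkpoint epoch 1) into
+the current directory:
+
+```python
+import mxnet as mx
+
+# load the converted symbol and parameters (prefix 'vgg16', epoch 1 assumed)
+model = mx.model.FeedForward.load('vgg16', 1)
+print(model.symbol.list_arguments())
+```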
+

 ### Note
diff --git a/tools/caffe_converter/caffe_parse/parse_from_protobuf.py b/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
index 6350a20dfb21..865e047507df 100644
--- a/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
+++ b/tools/caffe_converter/caffe_parse/parse_from_protobuf.py
@@ -10,7 +10,7 @@ def parse_caffemodel(filepath):
     returns: layers
     '''
-    f = open(filepath)
+    f = open(filepath, 'rb')
     contents = f.read()

     netparam = caffe_pb2.NetParameter()
diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py
index 113e0b28cf37..891681fb347a 100644
--- a/tools/caffe_converter/convert_model.py
+++ b/tools/caffe_converter/convert_model.py
@@ -63,10 +63,13 @@ def main():
         if layer_type == 'Convolution' or layer_type == 'InnerProduct' or layer_type == 4 or layer_type == 14:
             assert(len(layer_blobs) == 2)
             wmat_dim = []
-            if len(layer_blobs[0].shape.dim) > 0:
-                wmat_dim = layer_blobs[0].shape.dim
+            if getattr(layer_blobs[0].shape, 'dim', None) is not None:
+                if len(layer_blobs[0].shape.dim) > 0:
+                    wmat_dim = layer_blobs[0].shape.dim
+                else:
+                    wmat_dim = [layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width]
             else:
-                wmat_dim = [layer_blobs[0].num, layer_blobs[0].channels, layer_blobs[0].height, layer_blobs[0].width]
+                wmat_dim = list(layer_blobs[0].shape)
             wmat = np.array(layer_blobs[0].data).reshape(wmat_dim)
             bias = np.array(layer_blobs[1].data)
             if first_conv:
diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py
index b4fea0b0b7c1..a62a25e7e7cf 100644
--- a/tools/caffe_converter/convert_symbol.py
+++ b/tools/caffe_converter/convert_symbol.py
@@ -97,9 +97,14 @@ def proto2script(proto_file):
         if layer[i].type == 'Pooling' or layer[i].type == 17:
             type_string = 'mx.symbol.Pooling'
             param = layer[i].pooling_param
-            param_string = "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\
-                (param.pad, param.pad, param.kernel_size,\
-                param.kernel_size, param.stride, param.stride)
+            param_string = ''
+            if param.global_pooling == True:
+                # there must be a param `kernel` in a pooling layer
+                param_string += "global_pool=True, kernel=(1,1)"
+            else:
+                param_string += "pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d)" %\
+                    (param.pad, param.pad, param.kernel_size,\
+                    param.kernel_size, param.stride, param.stride)
             if param.pool == 0:
                 param_string = param_string + ", pool_type='max'"
             elif param.pool == 1:
@@ -129,9 +134,6 @@ def proto2script(proto_file):
             need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]]
         if layer[i].type == 'Softmax' or layer[i].type == 20:
             type_string = 'mx.symbol.SoftmaxOutput'
-
-            # We only support single output network for now.
-            output_name = name
         if layer[i].type == 'Flatten' or layer[i].type == 8:
             type_string = 'mx.symbol.Flatten'
             need_flatten[name] = False
@@ -140,9 +142,16 @@ def proto2script(proto_file):
         if layer[i].type == 'Concat' or layer[i].type == 3:
             type_string = 'mx.symbol.Concat'
             need_flatten[name] = True
+        if layer[i].type == 'Crop':
+            type_string = 'mx.symbol.Crop'
+            need_flatten[name] = True
+            param_string = 'center_crop=True'
+        if layer[i].type == 'BatchNorm':
+            type_string = 'mx.symbol.BatchNorm'
+            param = layer[i].batch_norm_param
+            param_string = 'use_global_stats=%s' % param.use_global_stats
         if type_string == '':
             raise Exception('Unknown Layer %s!' % layer[i].type)
-
         if type_string != 'split':
             bottom = layer[i].bottom
             if param_string != "":
@@ -163,6 +172,7 @@ def proto2script(proto_file):
                     (name, type_string, name, ','.join([mapping[x] for x in bottom]), param_string)
             for j in range(len(layer[i].top)):
                 mapping[layer[i].top[j]] = name
+            output_name = name
     return symbol_string, output_name, input_dim

 def proto2symbol(proto_file):
diff --git a/tools/caffe_converter/make_win32.bat b/tools/caffe_converter/make_win32.bat
new file mode 100644
index 000000000000..7d354dcaeb6c
--- /dev/null
+++ b/tools/caffe_converter/make_win32.bat
@@ -0,0 +1,3 @@
+@protoc --python_out=./ ./caffe_parse/caffe.proto
+@echo done.
+@pause
diff --git a/tools/im2rec.py b/tools/im2rec.py
index 8f2ca9248930..7df1f5a6c72f 100644
--- a/tools/im2rec.py
+++ b/tools/im2rec.py
@@ -1,20 +1,21 @@
 import os
 import sys
+
 curr_path = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.join(curr_path, "../python"))
 import mxnet as mx
 import random
-import numpy as np
 import argparse
-import threading
-import cv, cv2
+import cv2
 import time
+
+
 def list_image(root, recursive, exts):
     image_list = []
     if recursive:
         cat = {}
         for path, subdirs, files in os.walk(root, followlinks=True):
-            subdirs.sort()
             subdirs.sort()
             print(len(cat), path)
             for fname in files:
                 fpath = os.path.join(path, fname)
@@ -31,50 +32,56 @@ def list_image(root, recursive, exts):
             image_list.append((len(image_list), os.path.relpath(fpath, root), 0))
     return image_list

+
 def write_list(path_out, image_list):
     with open(path_out, 'w') as fout:
-        for i in xrange(len(image_list)):
-            line = '%d\t'%image_list[i][0]
+        n_images = xrange(len(image_list))
+        for i in n_images:
+            line = '%d\t' % image_list[i][0]
             for j in image_list[i][2:]:
-                line += '%f\t'%j
-            line += '%s\n'%image_list[i][1]
+                line += '%d\t' % j
+            line += '%s\n' % image_list[i][1]
             fout.write(line)

-def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio, test_ratio):
-    image_list = list_image(root, recursive, exts)
-    random.seed(100)
-    random.shuffle(image_list)
+
+def make_list(args):
+    image_list = list_image(args.root, args.recursive, args.exts)
+    if args.shuffle is True:
+        random.seed(100)
+        random.shuffle(image_list)
     N = len(image_list)
-    chunk_size = (N+num_chunks-1)/num_chunks
-    for i in xrange(num_chunks):
-        chunk = image_list[i*chunk_size:(i+1)*chunk_size]
-        if num_chunks > 1:
-            str_chunk = '_%d'%i
+    chunk_size = (N + args.chunks - 1) / args.chunks
+    for i in xrange(args.chunks):
+        chunk = image_list[i * chunk_size:(i + 1) * chunk_size]
+        if args.chunks > 1:
+            str_chunk = '_%d' % i
         else:
             str_chunk = ''
-        sep = int(chunk_size*train_ratio)
-        sep_test=int(chunk_size*test_ratio)
-        write_list(prefix_out+str_chunk+'_test.lst', chunk[:sep_test])
-        write_list(prefix_out+str_chunk+'_train.lst', chunk[sep_test:sep_test+sep])
-        write_list(prefix_out+str_chunk+'_val.lst', chunk[sep_test+sep:])
+        sep = int(chunk_size * args.train_ratio)
+        sep_test = int(chunk_size * args.test_ratio)
+        write_list(args.prefix + str_chunk + '_test.lst', chunk[:sep_test])
+        write_list(args.prefix + str_chunk + '_train.lst', chunk[sep_test:sep_test + sep])
+        write_list(args.prefix + str_chunk + '_val.lst', chunk[sep_test + sep:])

+
 def read_list(path_in):
     image_list = []
     with open(path_in) as fin:
         for line in fin.readlines():
             line = [i.strip() for i in line.strip().split('\t')]
-            item = [int(line[0])] + [line[-1]] + [float(i) for i in line[1:-1]]
+            item = [int(line[0])] + [line[-1]] + [int(i) for i in line[1:-1]]
             image_list.append(item)
     return image_list

-def write_record(args, image_list):
+
+def write_record(args, image_list, fname):
     source = image_list
     tic = [time.time()]
     color_modes = {-1: cv2.IMREAD_UNCHANGED,
-                   0: cv2.IMREAD_GRAYSCALE,
-                   1: cv2.IMREAD_COLOR}
+                    0: cv2.IMREAD_GRAYSCALE,
+                    1: cv2.IMREAD_COLOR}
     total = len(source)
-
+
     def image_encode(item, q_out):
         try:
             img = cv2.imread(os.path.join(args.root, item[1]), color_modes[args.color])
@@ -86,16 +93,16 @@ def image_encode(item, q_out):
             return
         if args.center_crop:
             if img.shape[0] > img.shape[1]:
-                margin = (img.shape[0] - img.shape[1])/2;
-                img = img[margin:margin+img.shape[1], :]
+                margin = (img.shape[0] - img.shape[1]) / 2;
+                img = img[margin:margin + img.shape[1], :]
             else:
-                margin = (img.shape[1] - img.shape[0])/2;
-                img = img[:, margin:margin+img.shape[0]]
+                margin = (img.shape[1] - img.shape[0]) / 2;
+                img = img[:, margin:margin + img.shape[0]]
         if args.resize:
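+            # cv2.resize takes (width, height): resize the shorter edge to
+            # args.resize and scale the longer edge to keep the aspect ratio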
             if img.shape[0] > img.shape[1]:
-                newsize = (img.shape[0]*args.resize/img.shape[1], args.resize)
+                newsize = (args.resize, img.shape[0] * args.resize / img.shape[1])
             else:
-                newsize = (args.resize, img.shape[1]*args.resize/img.shape[0])
+                newsize = (img.shape[1] * args.resize / img.shape[0], args.resize)
             img = cv2.resize(img, newsize)

             header = mx.recordio.IRHeader(0, item[2], item[0], 0)
@@ -103,7 +110,7 @@ def image_encode(item, q_out):
             s = mx.recordio.pack_img(header, img, quality=args.quality, img_fmt=args.encoding)
             q_out.put(('data', s, item))
         except:
-            print 'pack_img error:',item[1]
+            print 'pack_img error:', item[1]
             return

     def read_worker(q_in, q_out):
         while True:
             item = q_in.get()
             image_encode(item, q_out)

-    def write_worker(q_out, prefix):
+    def write_worker(q_out, fname, saving_folder):
         pre_time = time.time()
         sink = []
-        record = mx.recordio.MXRecordIO(prefix+'.rec', 'w')
+        os.chdir(saving_folder)
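+        # name the output .rec/.lst after the input list file, minus its extension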
+        fname_rec = fname[:fname.rfind('.')]
+        record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w')
         while True:
             stat, s, item = q_out.get()
             if stat == 'finish':
-                write_list(prefix+'.lst', sink)
+                write_list(fname_rec + '.lst', sink)
                 break
             record.write(s)
             sink.append(item)
@@ -134,10 +143,10 @@ def write_worker(q_out, prefix):
         for i in range(len(image_list)):
             q_in[i % len(q_in)].put(image_list[i])
         read_process = [multiprocessing.Process(target=read_worker, args=(q_in[i], q_out)) \
-                        for i in range(args.num_thread)]
+                        for i in range(args.num_thread)]
         for p in read_process:
             p.start()
-        write_process = multiprocessing.Process(target=write_worker, args=(q_out,args.prefix))
+        write_process = multiprocessing.Process(target=write_worker, args=(q_out, fname, args.saving_folder))
         write_process.start()
         for p in read_process:
             p.join()
@@ -147,7 +156,9 @@ def write_worker(q_out, prefix):
         print('multiprocessing not available, fall back to single threaded encoding')
         import Queue
         q_out = Queue.Queue()
-        record = mx.recordio.MXRecordIO(args.prefix+'.rec', 'w')
+        os.chdir(args.saving_folder)
+        fname_rec = fname[:fname.rfind('.')]
+        record = mx.recordio.MXRecordIO(fname_rec + '.rec', 'w')
         cnt = 0
         pre_time = time.time()
         for item in image_list:
@@ -162,73 +173,66 @@ def write_worker(q_out, prefix):
             print 'time:', cur_time - pre_time, ' count:', cnt
             pre_time = cur_time

+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
         description='Create an image list or \
-        make a record database by reading from an image list')
+                     make a record database by reading from an image list')
     parser.add_argument('prefix', help='prefix of input/output files.')
     parser.add_argument('root', help='path to folder containing images.')

     cgroup = parser.add_argument_group('Options for creating image lists')
     cgroup.add_argument('--list', type=bool, default=False,
-                        help='If this is set im2rec will create image list(s) by traversing root folder\
+                        help='If this is set im2rec will create image list(s) by traversing root folder\
                         and output to <prefix>.lst.\
                         Otherwise im2rec will read <prefix>.lst and create a database at <prefix>.rec')
-    cgroup.add_argument('--exts', type=list, default=['.jpeg','.jpg'],
-                        help='list of acceptable image extensions.')
+    cgroup.add_argument('--exts', type=list, default=['.jpeg', '.jpg'],
+                        help='list of acceptable image extensions.')
     cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.')
-    cgroup.add_argument('--train_ratio', type=float, default=1.0,
-                        help='Ratio of images to use for training.')
-    cgroup.add_argument('--test_ratio', type=float, default=0,
-                        help='Ratio of images to use for testing.')
+    cgroup.add_argument('--train-ratio', type=float, default=1.0,
+                        help='Ratio of images to use for training.')
+    cgroup.add_argument('--test-ratio', type=float, default=0,
+                        help='Ratio of images to use for testing.')
     cgroup.add_argument('--recursive', type=bool, default=False,
-                        help='If true recursively walk through subdirs and assign an unique label\
+                        help='If true recursively walk through subdirs and assign a unique label\
                         to images in each folder. Otherwise only include images in the root folder\
                         and give them label 0.')

     rgroup = parser.add_argument_group('Options for creating database')
     rgroup.add_argument('--resize', type=int, default=0,
-                        help='resize the shorter edge of image to the newsize, original images will\
+                        help='resize the shorter edge of image to the newsize, original images will\
                         be packed by default.')
-    rgroup.add_argument('--center_crop', type=bool, default=False,
-                        help='specify whether to crop the center image to make it rectangular.')
+    rgroup.add_argument('--center-crop', type=bool, default=False,
+                        help='specify whether to crop the center of the image to make it square.')
     rgroup.add_argument('--quality', type=int, default=80,
-                        help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9')
+                        help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9')
     rgroup.add_argument('--num_thread', type=int, default=1,
-                        help='number of thread to use for encoding. order of images will be different\
+                        help='number of thread to use for encoding. order of images will be different\
                         from the input list if >1. the input list will be modified to match the\
                         resulting order.')
     rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1],
-                        help='specify the color mode of the loaded image.\
+                        help='specify the color mode of the loaded image.\
                         1: Loads a color image. Any transparency of image will be neglected. It is the default flag.\
                         0: Loads image in grayscale mode.\
                         -1:Loads image as such including alpha channel.')
     rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'],
-                        help='specify the encoding of the images.')
-    rgroup.add_argument('--shuffle', action='store_true',
-                        help='If this is set and --list is not, im2rec will randomize the image order\
-                        in <prefix>.lst and <prefix>.rec.')
-
+                        help='specify the encoding of the images.')
+    rgroup.add_argument('--saving-folder', type=str, default='.',
+                        help='folder in which .rec files will be saved.')
+    rgroup.add_argument('--shuffle', default=True, help='If this is set as True, \
+                        im2rec will randomize the image order in <prefix>.lst')
     args = parser.parse_args()
-
     if args.list:
-        make_list(args.prefix, args.root, args.recursive,
-                  args.exts, args.chunks, args.train_ratio, args.test_ratio)
+        make_list(args)
     else:
         files = [f for f in os.listdir('.') if os.path.isfile(f)]
         for f in files:
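+            # only convert the list files that were generated for this prefix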
-            # do something
-            #print 'path: ', path
-            #print 'subdirs: ', subdirs
-            print 'current file: ', f
-            if f.startswith(args.prefix) is True:
-                print 'OK'
+            if f.startswith(args.prefix) is True and f.endswith('.lst') is True:
+                print 'Creating .rec file from', f, 'in', args.saving_folder
                 image_list = read_list(f)
-                if args.shuffle:
-                    random.shuffle(image_list)
-                write_record(args, image_list)
-            else:
-                print 'not OK'
+                write_record(args, image_list, f)
+
+
 if __name__ == '__main__':
     main()
diff --git a/tools/make_list.py b/tools/make_list.py
deleted file mode 100644
index 578f2c4c3283..000000000000
--- a/tools/make_list.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import random
-import numpy as np
-import argparse
-
-def list_image(root, recursive, exts):
-    image_list = []
-    if recursive:
-        cat = {}
-        for path, subdirs, files in os.walk(root):
-            print path
-            for fname in files:
-                fpath = os.path.join(path, fname)
-                suffix = os.path.splitext(fname)[1].lower()
-                if os.path.isfile(fpath) and (suffix in exts):
-                    if path not in cat:
-                        cat[path] = len(cat)
-                    image_list.append((os.path.relpath(fpath, root), cat[path]))
-    else:
-        for fname in os.listdir(root):
-            fpath = os.path.join(root, fname)
-            suffix = os.path.splitext(fname)[1].lower()
-            if os.path.isfile(fpath) and (suffix in exts):
-                image_list.append((os.path.relpath(fpath, root), 0))
-    return image_list
-
-def write_list(path_out, image_list):
-    with open(path_out, 'w') as fout:
-        for i in xrange(len(image_list)):
-            fout.write('%d\t%d\t%s\n'%(i, image_list[i][1], image_list[i][0]))
-
-
-def make_list(prefix_out, root, recursive, exts, num_chunks, train_ratio):
-    image_list = list_image(root, recursive, exts)
-    random.shuffle(image_list)
-    N = len(image_list)
-    chunk_size = (N+num_chunks-1)/num_chunks
-    for i in xrange(num_chunks):
-        chunk = image_list[i*chunk_size:(i+1)*chunk_size]
-        if num_chunks > 1:
-            str_chunk = '_%d'%i
-        else:
-            str_chunk = ''
-        if train_ratio < 1:
-            sep = int(chunk_size*train_ratio)
-            write_list(prefix_out+str_chunk+'_train.lst', chunk[:sep])
-            write_list(prefix_out+str_chunk+'_val.lst', chunk[sep:])
-        else:
-            write_list(prefix_out+str_chunk+'.lst', chunk)
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description='Make image list files that are\
-        required by im2rec')
-    parser.add_argument('root', help='path to folder that contain images.')
-    parser.add_argument('prefix', help='prefix of output list files.')
-    parser.add_argument('--exts', type=str, nargs='+', default=['.jpeg','.jpg'],
-                        help='list of acceptable image extensions.')
-    parser.add_argument('--chunks', type=int, default=1, help='number of chunks.')
-    parser.add_argument('--train_ratio', type=float, default=1.0,
-                        help='Percent of images to use for training.')
-    parser.add_argument('--recursive', type=bool, default=False,
-                        help='If true recursively walk through subdirs and assign an unique label\
-                        to images in each folder. Otherwise only include images in the root folder\
-                        and give them label 0.')
-    args = parser.parse_args()
-
-    make_list(args.prefix, args.root, args.recursive,
-              args.exts, args.chunks, args.train_ratio)
-
-if __name__ == '__main__':
-    main()