diff --git a/.gitignore b/.gitignore
index 5ada46a483c1..04153e1f7136 100644
--- a/.gitignore
+++ b/.gitignore
@@ -62,9 +62,11 @@ __pycache__
*.json
*.d
build
+cmake-build-debug
+cmake-build-release
+cmake-build
data
recommonmark
-bin
deps

# R
@@ -85,6 +87,7 @@ R-package/R/mxnet_generated.R
*ubyte
*.bin
*.txt
+!CMakeLists.txt

# ipython notebook
*_pb2.py
@@ -141,7 +144,12 @@ tools/pip_package/mxnet.egg-info
tools/pip_package/mxnet

# temporary path for building dependencies when building wheel
-deps/
-
+./deps/
+bld
+./tmp/*
*.jar
-target
\ No newline at end of file
+target
+cmake-build
+
+
+bin/im2rec
diff --git a/.gitmodules b/.gitmodules
index 63d0c4cbeb04..08f2bc99f2aa 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,4 +13,4 @@
[submodule "cub"]
	path = cub
	url = https://github.com/NVlabs/cub
-	shallow = true
+	shallow=true
diff --git a/.travis.yml b/.travis.yml
index 0cb5b33f6870..c8ba0b1e645b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,6 +6,8 @@ os:
# - linux
  - osx

+osx_image: xcode8
+
env:
  # code analysis
  # - TASK=lint
@@ -21,9 +23,19 @@ env:

# TODO, R test, distributed test, clang, more g++ versions
-# matrix:
-#   exclude:
-#   - os: osx
+matrix:
+  include:
+  - # os: linux
+    # dist: trusty
+    # env: TASK=perl_test
+  - os: osx
+    ## sudo is required because
+    ## preexisting packages conflict
+    ## with new ones.
+    ## would be nice to have macports
+    ## on travis osx, it has all needed perl packages
+    sudo: required
+    env: TASK=perl_test
# env: TASK=julia JULIA_VER=0.4
# - os: linux
#   env: TASK=build
@@ -57,6 +69,11 @@ addons:
      - python3-dev
      - python3-nose
      - graphviz
+      - libmouse-perl
+      - pdl
+      - cpanminus
+      - swig
+      - libgraphviz-perl

before_install:
  - export NVCC_PREFIX=${HOME}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fe5cf32ae68a..4c94073fe419 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.7)
+cmake_minimum_required(VERSION 3.0.2)

project(mxnet C CXX)
@@ -9,18 +9,29 @@ endif()

set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}")
include(cmake/Utils.cmake)
-mxnet_option(USE_OPENCV "Build with OpenCV support" ON)
-mxnet_option(USE_OPENMP "Build with Openmp support" ON)
-mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path
-mxnet_option(USE_CUDA "Build with CUDA support" ON)
-mxnet_option(USE_PROFILER "Build with Profiler support" OFF)
-mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF)
+mxnet_option(USE_OPENCV           "Build with OpenCV support" ON)
+mxnet_option(USE_OPENMP           "Build with Openmp support" ON)
+mxnet_option(USE_CUDA             "Build with CUDA support" ON)
+mxnet_option(USE_CUDNN            "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path
+mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON)
+mxnet_option(USE_MKLML_MKL        "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE))
+mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF)
+mxnet_option(USE_JEMALLOC         "Build with Jemalloc support" OFF)
+mxnet_option(USE_PROFILER         "Build with Profiler support" OFF)
+mxnet_option(USE_DIST_KVSTORE     "Build with DIST_KVSTORE support" OFF)
mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF)
-mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF)
+mxnet_option(USE_PLUGIN_CAFFE     "Use Caffe Plugin" OFF)
+mxnet_option(USE_CPP_PACKAGE      "Build C++ Package" OFF)
mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON)

SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH")

+if("$ENV{VERBOSE}" STREQUAL "1")
+  message(STATUS " Verbose Makefile ACTIVATED")
+  set(CMAKE_VERBOSE_MAKEFILE ON)
+endif()
+
+
if(MSVC)
  add_definitions(-DWIN32_LEAN_AND_MEAN)
  add_definitions(-DDMLC_USE_CXX11)
@@ -51,6 +62,37 @@ else(MSVC)
  endif()
endif(MSVC)

+set(mxnet_LINKER_LIBS "")
+
+if(USE_MKL_IF_AVAILABLE)
+  if(USE_MKL_EXPERIMENTAL AND NOT USE_MKLML_MKL)
+    message(ERROR " USE_MKL_EXPERIMENTAL can only be used when USE_MKLML_MKL is enabled")
+  endif()
+  find_package(MKL)
+  if(MKL_FOUND)
+    include_directories(${MKL_INCLUDE_DIR})
+    include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl)
+    add_definitions(-DMXNET_USE_MKL2017=1)
+    add_definitions(-DUSE_MKL=1)
+    add_definitions(-DCUB_MKL=1)
+    list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES})
+    if(NOT MSVC)
+      list(APPEND mxnet_LINKER_LIBS dl)
+    endif()
+    if(USE_MKL_EXPERIMENTAL)
+      add_definitions(-DMKL_EXPERIMENTAL=1)
+    else()
+      add_definitions(-DMKL_EXPERIMENTAL=0)
+    endif()
+  else()
+    message(STATUS " MKL not found")
+  endif()
+endif()
+
+# Allow Cuda compiles outside of src tree to find things in 'src' and 'include'
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)
+
if(EXISTS ${PROJECT_SOURCE_DIR}/mshadow/cmake)
  include(mshadow/cmake/mshadow.cmake)
  include(mshadow/cmake/Utils.cmake)
@@ -61,21 +103,40 @@ else()
  include(mshadow)
endif()

+list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS})
+
foreach(var ${C_CXX_INCLUDE_DIRECTORIES})
  include_directories(${var})
endforeach()

-set(mxnet_LINKER_LIBS "")
-set(mxnet_LINKER_LIBS_DEBUG "")
-set(mxnet_LINKER_LIBS_RELEASE "")
-list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS})
-
include_directories("include")
include_directories("mshadow")
include_directories("cub")
include_directories("nnvm/include")
include_directories("dmlc-core/include")

+if(NOT MSVC)
+  set(BEGIN_WHOLE_ARCHIVE -Wl,--whole-archive)
+  set(END_WHOLE_ARCHIVE -Wl,--no-whole-archive)
+endif()
+
+if(UNIX)
+  find_library(RTLIB rt)
+  if(RTLIB)
+    list(APPEND mxnet_LINKER_LIBS ${RTLIB})
+  endif()
+endif()
+
+# ---[ jemalloc
+if(USE_JEMALLOC)
+  find_package(JeMalloc)
+  if(JEMALLOC_FOUND)
+    add_definitions(-DUSE_JEMALLOC)
+    include_directories(${JEMALLOC_INCLUDE_DIRS})
+    set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${JEMALLOC_LIBRARIES})
+  endif()
+endif()
+
if(USE_OPENCV)
  find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs)
  if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found
@@ -83,15 +144,19 @@ if(USE_OPENCV)
  endif()
  include_directories(SYSTEM ${OpenCV_INCLUDE_DIRS})
  list(APPEND mxnet_LINKER_LIBS ${OpenCV_LIBS})
+  message(STATUS " OpenCV_LIBS=${OpenCV_LIBS}")
  message(STATUS "OpenCV found (${OpenCV_CONFIG_PATH})")
  add_definitions(-DMXNET_USE_OPENCV=1)
+  if(NOT MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-undefined")
+  endif()
else(USE_OPENCV)
  message(STATUS "OpenCV Disabled")
  add_definitions(-DMXNET_USE_OPENCV=0)
endif()

if(USE_OPENMP)
-  FIND_PACKAGE( OpenMP REQUIRED)
+  find_package(OpenMP REQUIRED)
  if(OPENMP_FOUND)
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@@ -100,6 +165,25 @@ if(USE_OPENMP)
  endif()
endif()

+if(UNIX)
+  find_library(RTLIB rt)
+  if(RTLIB)
+    list(APPEND mxnet_LINKER_LIBS ${RTLIB})
+  endif()
+endif()
+
+# ---[ jemalloc
+if(USE_JEMALLOC)
+  find_package(JeMalloc)
+  if(JEMALLOC_FOUND)
+    add_definitions(-DUSE_JEMALLOC)
+    include_directories(${JEMALLOC_INCLUDE_DIRS})
+    set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${JEMALLOC_LIBRARIES})
+  endif()
+endif()
+
+include(CTest)
+
# cudnn detection
if(USE_CUDNN AND USE_CUDA)
  detect_cuDNN()
@@ -111,15 +195,10 @@ if(USE_CUDNN AND USE_CUDA)
  endif()
endif()

-if(EXISTS ${PROJECT_SOURCE_DIR}/dmlc-core/cmake)
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/cmake)
  add_subdirectory("dmlc-core")
endif()

-if(USE_DIST_KVSTORE)
-  if(EXISTS ${PROJECT_SOURCE_DIR}/ps-lite/CMakeLists.txt)
-    add_subdirectory("ps-lite")
-  endif()
-endif()

mxnet_source_group("Include\\common" GLOB "src/common/*.h")
mxnet_source_group("Include\\c_api" GLOB "src/c_api/*.h")
@@ -199,8 +278,7 @@ if(USE_PLUGINS_WARPCTC)
  set(WARPCTC_INCLUDE "" CACHE PATH "WARPCTC include")
  set(WARPCTC_LIB_DEBUG "" CACHE FILEPATH "WARPCTC lib")
  set(WARPCTC_LIB_RELEASE "" CACHE FILEPATH "WARPCTC lib")
-  set(mxnet_LINKER_LIBS_RELEASE ${WARPCTC_LIB_RELEASE})
-  set(mxnet_LINKER_LIBS_DEBUG ${WARPCTC_LIB_DEBUG})
+
  include_directories(SYSTEM ${WARPCTC_INCLUDE})
  list(APPEND mxnet_LINKER_LIBS ${WARPCTC_LIB})
@@ -214,8 +292,12 @@ if(USE_PLUGINS_WARPCTC)
endif()

if(USE_PLUGIN_CAFFE)
+  if(NOT USE_CUDA)
+    set(CPU_ONLY ON)
+    add_definitions(-DCPU_ONLY=1)
+  endif()
  if(NOT DEFINED CAFFE_PATH)
-    if(EXISTS ${PROJECT_SOURCE_DIR}/caffe)
+    if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/caffe)
      # Need newer FindCUDA.cmake that correctly handles -std=c++11
      cmake_minimum_required(VERSION 3.3)
      set(CAFFE_PATH ${PROJECT_SOURCE_DIR}/caffe)
@@ -237,6 +319,7 @@ if(USE_PLUGIN_CAFFE)
  list(APPEND SOURCE ${PLUGINS_SOURCE})
  list(APPEND CUDA ${PLUGINS_CUSRC})
  include_directories(${CMAKE_BINARY_DIR}/include)
+  add_definitions(-DMXNET_USE_CAFFE=1)
  list(APPEND mxnet_LINKER_LIBS
    protobuf boost_system boost_thread boost_filesystem
    gflags glog caffe
@@ -271,27 +354,39 @@ if(USE_CUDA)
    list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY})
    set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib")
    list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY})
+    FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32")
+    list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator
  else(MSVC)
-    list(APPEND mxnet_LINKER_LIBS nvrtc cuda)
+    list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft)
    link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64")
  endif()
  list(APPEND SOURCE ${cuda_objs} ${CUDA})
  add_definitions(-DMXNET_USE_CUDA=1)
  add_definitions(-DMXNET_USE_NVRTC=1)
  if(CUDA_LIBRARY_PATH)
-    link_directories(${CUDA_LIBRARY_PATH}/stubs)
+    if(IS_CONTAINER_BUILD)
+      # In case of building on a production-like build container which may not have Cuda installed
+      if(NOT CMAKE_SYSTEM_HAS_CUDA)
+        # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine)
+        # so use the stub cuda driver shared library
+        if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so)
+          link_directories(${CUDA_LIBRARY_PATH}/stubs)
+        endif()
+      endif()
+    endif()
  endif()
endif()

# unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well
if(USE_PLUGIN_CAFFE)
-  if(EXISTS ${PROJECT_SOURCE_DIR}/caffe)
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/caffe)
    add_subdirectory(caffe)
  endif()
endif()

if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/nnvm/CMakeLists.txt")
-  list(APPEND mxnet_LINKER_LIBS nnvm)
+  set(nnvm_LINKER_LIBS nnvm)
+  list(APPEND mxnet_LINKER_LIBS ${nnvm_LINKER_LIBS})
endif()

if(NOT MSVC)
@@ -307,44 +402,98 @@ endif()

if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin" AND USE_MXNET_LIB_NAMING)
  add_library(mxnet MODULE ${SOURCE})
else()
-  add_library(mxnet SHARED ${SOURCE})
+  if(UNIX)
+    set(MXNET_DYNAMIC_ONLY ON)
+    if(MXNET_DYNAMIC_ONLY)
+      add_library(mxnet SHARED ${SOURCE})
+    else()
+      set(INITIALIZE_SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/src/initialize.cc)
+      list(REMOVE_ITEM SOURCE ${INITIALIZE_SOURCE_FILE})
+      add_library(mxnet_static STATIC ${INITIALIZE_SOURCE_FILE} ${SOURCE})
+      # Need an arbitrary source file to trigger CMake to build the library
+      add_library(mxnet SHARED ${INITIALIZE_SOURCE_FILE})
+      # This has problems, as it adds libmxnet_static to INTERFACE_LINK_LIBRARIES
+      target_link_libraries(mxnet "-Wl,--whole-archive $<TARGET_FILE:mxnet_static> -Wl,--no-whole-archive")
+      #target_link_libraries(mxnet mxnet_static)
+      add_custom_target(
+        StaticallyLinkStaticMXNetLibrary ALL
+        BYPRODUCTS ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/libmxnet.a
+        WORKING_DIRECTORY ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}
+        COMMAND ln -sf libmxnet_static.a libmxnet.a
+        DEPENDS mxnet_static
+      )
+    endif()
+  else()
+    add_library(mxnet SHARED ${SOURCE})
+  endif()
endif()

target_link_libraries(mxnet ${mxnet_LINKER_LIBS})

if(USE_PLUGINS_WARPCTC)
-  target_link_libraries(mxnet debug ${mxnet_LINKER_LIBS_DEBUG})
-  target_link_libraries(mxnet optimized ${mxnet_LINKER_LIBS_RELEASE})
+  target_link_libraries(mxnet debug ${WARPCTC_LIB_DEBUG})
+  target_link_libraries(mxnet optimized ${WARPCTC_LIB_RELEASE})
endif()

target_link_libraries(mxnet dmlccore)
-
if(MSVC AND USE_MXNET_LIB_NAMING)
  set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet")
-endif()
+endif()
+
if(USE_DIST_KVSTORE)
-  add_definitions(-DMXNET_USE_DIST_KVSTORE)
-  target_link_libraries(mxnet pslite)
-  target_link_libraries(mxnet ${pslite_LINKER_LIBS})
-  include_directories(SYSTEM ${pslite_INCLUDE_DIR})
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/CMakeLists.txt)
+    add_subdirectory("ps-lite")
+    list(APPEND pslite_LINKER_LIBS pslite)
+    target_link_libraries(mxnet debug ${pslite_LINKER_LIBS_DEBUG})
+    target_link_libraries(mxnet optimized ${pslite_LINKER_LIBS_RELEASE})
+  else()
+    set(pslite_LINKER_LIBS protobuf zmq-static )
+  endif()
+  add_definitions(-DMXNET_USE_DIST_KVSTORE)
+  target_link_libraries(mxnet ${pslite_LINKER_LIBS})
+  include_directories(SYSTEM ${pslite_INCLUDE_DIR})
endif()

if(USE_PROFILER)
  add_definitions(-DMXNET_USE_PROFILER)
endif()

+# Do tests after chrpath so that we use the "real" cuda driver
+add_subdirectory(tests)
+
+# AUTO_INSTALL_DIR -> Optional: specify post-build install directory
+if(AUTO_INSTALL_DIR)
+  # ---[ Install Includes
+  add_custom_command(TARGET mxnet POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+    ${PROJECT_SOURCE_DIR}/include ${AUTO_INSTALL_DIR}/include
+  )
+
+  # ---[ Install Examples
+  add_custom_command(TARGET mxnet POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_directory
+    ${PROJECT_SOURCE_DIR}/example ${AUTO_INSTALL_DIR}/example
+  )
+endif()
+
if(INSTALL_PYTHON_VERSIONS)
+  message(STATUS "Installing for python versions: ${INSTALL_PYTHON_VERSIONS}")
  foreach(version ${INSTALL_PYTHON_VERSIONS})
    set(outdir ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/python${version}/site-packages/mxnet)
    add_custom_command(TARGET mxnet POST_BUILD
      COMMAND mkdir -p ${outdir}
-      COMMAND cp -rul ${CMAKE_CURRENT_SOURCE_DIR}/python/mxnet/* ${outdir}
+      COMMAND cp -ru ${CMAKE_CURRENT_SOURCE_DIR}/python/mxnet/* ${outdir}
    )
  endforeach()
endif()

+if(USE_CPP_PACKAGE)
+  add_subdirectory(cpp-package)
+endif()
+
+add_subdirectory(example/image-classification/predict-cpp)
+
# ---[ Linter target
if(MSVC)
  find_package(PythonInterp)
@@ -353,5 +502,3 @@ endif()
set(LINT_DIRS include src scripts python)
add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${PROJECT_SOURCE_DIR} -DPROJECT_NAME=mxnet -P ${PROJECT_SOURCE_DIR}/dmlc-core/cmake/lint.cmake)
-add_subdirectory(tests/cpp)
-add_subdirectory(example/image-classification/predict-cpp)
\ No newline at end of file
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 3db3e4e448d1..7c01c62e433f 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -40,11 +40,11 @@ The committers are the granted write access to the project.
* [Yuan Tang](https://github.com/terrytangyuan)
  - Yuan is one of major maintainers of mxnet scala package.

-### Become a Comitter
-MXNet is a opensource project and we are actively looking for new comitters
-who are willing to help maintaining and lead the project. Committers comes from contributors who:
+### Become a Committer
+MXNet is an open-source project and we are actively looking for new committers
+who are willing to help maintain and lead the project. Committers come from contributors who:
* Made substantial contribution to the project.
-* Willing to actively spent time on maintaining and lead the project.
+* Willing to actively spend time on maintaining and leading the project.

New committers will be proposed by current committers, with support from more than two of current committers.
@@ -122,3 +122,12 @@ List of Contributors
* [Yu Du](https://github.com/Answeror)
* [Xu Dong](https://github.com/dsqx71)
* [Chihiro Komaki](https://github.com/ckomaki)
+* [Piyush Singh](https://github.com/Piyush3dB)
+* [Freddy Chua](https://github.com/freddycct)
+* [Jie Zhang](https://github.com/luoyetx)
+* [Leonard Lausen](https://github.com/leezu)
+* [Sergey Kolychev](https://github.com/sergeykolychev)
+  - Sergey is the original author and current maintainer of the Perl5 interface.
+* [Robert Stone](https://github.com/tlby)
+* [Pedro Larroy](https://github.com/larroy)
+* [Jun Wu](https://github.com/reminisce)
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 000000000000..35fb2d70881a
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,322 @@
+// -*- mode: groovy -*-
+// Jenkins pipeline
+// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
+
+// mxnet libraries
+mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a'
+// command to start a docker container
+docker_run = 'tests/ci_build/ci_build.sh'
+// timeout in minutes
+max_time = 60
+
+// initialize source codes
+def init_git() {
+  checkout scm
+  retry(5) {
+    timeout(time: 2, unit: 'MINUTES') {
+      sh 'git submodule update --init'
+    }
+  }
+}
+
+def init_git_win() {
+  checkout scm
+  retry(5) {
+    timeout(time: 2, unit: 'MINUTES') {
+      bat 'git submodule update --init'
+    }
+  }
+}
+
+stage("Sanity Check") {
+  timeout(time: max_time, unit: 'MINUTES') {
+    node('linux') {
+      ws('workspace/sanity') {
+        init_git()
+        make('lint', 'cpplint rcpplint jnilint')
+        make('lint', 'pylint')
+      }
+    }
+  }
+}
+
+// Run make. First try to do an incremental make from a previous workspace in hope to
+// accelerate the compilation. If something goes wrong, clean the workspace and then
+// build from scratch.
+def make(docker_type, make_flag) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    try {
+      sh "${docker_run} ${docker_type} make ${make_flag}"
+    } catch (exc) {
+      echo 'Incremental compilation failed. Fall back to build from scratch'
+      sh "${docker_run} ${docker_type} make clean"
+      sh "${docker_run} ${docker_type} make ${make_flag}"
+    }
+  }
+}
+
+// pack libraries for later use
+def pack_lib(name, libs=mx_lib) {
+  sh """
+echo "Packing ${libs} into ${name}"
+echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
+"""
+  stash includes: libs, name: name
+}
+
+
+// unpack libraries saved before
+def unpack_lib(name, libs=mx_lib) {
+  unstash name
+  sh """
+echo "Unpacked ${libs} from ${name}"
+echo ${libs} | sed -e 's/,/ /g' | xargs md5sum
+"""
+}
+
+stage('Build') {
+  parallel 'CPU: Openblas': {
+    node('linux') {
+      ws('workspace/build-cpu') {
+        init_git()
+        def flag = """ \
+USE_PROFILER=1 \
+USE_BLAS=openblas \
+-j\$(nproc)
+"""
+        make("cpu", flag)
+        pack_lib('cpu')
+      }
+    }
+  },
+  'GPU: CUDA7.5+cuDNN5': {
+    node('GPU' && 'linux') {
+      ws('workspace/build-gpu') {
+        init_git()
+        def flag = """ \
+USE_PROFILER=1 \
+USE_BLAS=openblas \
+USE_CUDA=1 \
+USE_CUDA_PATH=/usr/local/cuda \
+USE_CUDNN=1 \
+-j\$(nproc)
+"""
+        make('gpu', flag)
+        pack_lib('gpu')
+      }
+    }
+  },
+  'Amalgamation': {
+    node('linux') {
+      ws('workspace/amalgamation') {
+        init_git()
+        make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1')
+      }
+    }
+  },
+  'GPU: MKLML': {
+    node('GPU' && 'linux') {
+      ws('workspace/build-mklml') {
+        init_git()
+        def flag = """ \
+USE_PROFILER=1 \
+USE_BLAS=openblas \
+USE_MKL2017=1 \
+USE_MKL2017_EXPERIMENTAL=1 \
+USE_CUDA=1 \
+USE_CUDA_PATH=/usr/local/cuda \
+USE_CUDNN=1 \
+-j\$(nproc)
+"""
+        make('mklml_gpu', flag)
+        pack_lib('mklml')
+      }
+    }
+  },
+  'CPU windows': {
+    node('windows') {
+      ws('workspace/build-cpu') {
+        withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
+          init_git_win()
+          bat """mkdir build_vc14_cpu
+cd build_vc14_cpu
+cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}"""
+          bat 'C:\\mxnet\\build_vc14_cpu.bat'
+
+          bat '''rmdir /s/q pkg_vc14_cpu
+mkdir pkg_vc14_cpu\\lib
+mkdir pkg_vc14_cpu\\python
+mkdir pkg_vc14_cpu\\include
+mkdir pkg_vc14_cpu\\build
+copy build_vc14_cpu\\Release\\libmxnet.lib pkg_vc14_cpu\\lib
+copy build_vc14_cpu\\Release\\libmxnet.dll pkg_vc14_cpu\\build
+xcopy python pkg_vc14_cpu\\python /E /I /Y
+xcopy include pkg_vc14_cpu\\include /E /I /Y
+xcopy dmlc-core\\include pkg_vc14_cpu\\include /E /I /Y
+xcopy mshadow\\mshadow pkg_vc14_cpu\\include\\mshadow /E /I /Y
+xcopy nnvm\\include pkg_vc14_cpu\\nnvm\\include /E /I /Y
+del /Q *.7z
+7z.exe a vc14_cpu.7z pkg_vc14_cpu\\
+'''
+          stash includes: 'vc14_cpu.7z', name: 'vc14_cpu'
+        }
+      }
+    }
+  },
+  'GPU windows': {
+    node('windows') {
+      ws('workspace/build-gpu') {
+        withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) {
+          init_git_win()
+          bat """mkdir build_vc14_gpu
+call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat"
+cd build_vc14_gpu
+cmake -G \"NMake Makefiles JOM\" -DUSE_CUDA=1 -DUSE_CUDNN=1 -DUSE_NVRTC=1 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_DIST_KVSTORE=0 -DCUDA_ARCH_NAME=All -DCMAKE_CXX_FLAGS_RELEASE="/FS /MD /O2 /Ob2 /DNDEBUG" -DCMAKE_BUILD_TYPE=Release ${env.WORKSPACE}"""
+          bat 'C:\\mxnet\\build_vc14_gpu.bat'
+          bat '''rmdir /s/q pkg_vc14_gpu
+mkdir pkg_vc14_gpu\\lib
+mkdir pkg_vc14_gpu\\python
+mkdir pkg_vc14_gpu\\include
+mkdir pkg_vc14_gpu\\build
+copy build_vc14_gpu\\libmxnet.lib pkg_vc14_gpu\\lib
+copy build_vc14_gpu\\libmxnet.dll pkg_vc14_gpu\\build
+xcopy python pkg_vc14_gpu\\python /E /I /Y
+xcopy include pkg_vc14_gpu\\include /E /I /Y
+xcopy dmlc-core\\include pkg_vc14_gpu\\include /E /I /Y
+xcopy mshadow\\mshadow pkg_vc14_gpu\\include\\mshadow /E /I /Y
+xcopy nnvm\\include pkg_vc14_gpu\\nnvm\\include /E /I /Y
+del /Q *.7z
+7z.exe a vc14_gpu.7z pkg_vc14_gpu\\
+'''
+          stash includes: 'vc14_gpu.7z', name: 'vc14_gpu'
+        }
+      }
+    }
+  }
+}
+
+// Python unittest for CPU
+def python_ut(docker_type) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests --with-timer --verbose tests/python/unittest"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest"
+  }
+}
+
+// GPU test has two parts. 1) run unittest on GPU, 2) compare the results on
+// both CPU and GPU
+def python_gpu_ut(docker_type) {
+  timeout(time: max_time, unit: 'MINUTES') {
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests --with-timer --verbose tests/python/gpu"
+    sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu"
+  }
+}
+
+stage('Unit Test') {
+  parallel 'Python2/3: CPU': {
+    node('linux') {
+      ws('workspace/ut-python-cpu') {
+        init_git()
+        unpack_lib('cpu')
+        python_ut('cpu')
+      }
+    }
+  },
+  'Python2/3: GPU': {
+    node('GPU' && 'linux') {
+      ws('workspace/ut-python-gpu') {
+        init_git()
+        unpack_lib('gpu', mx_lib)
+        python_gpu_ut('gpu')
+      }
+    }
+  },
+  'Python2/3: MKLML': {
+    node('GPU' && 'linux') {
+      ws('workspace/ut-python-mklml') {
+        init_git()
+        unpack_lib('mklml')
+        python_ut('mklml_gpu')
+        python_gpu_ut('mklml_gpu')
+      }
+    }
+  },
+  'Scala: CPU': {
+    node('linux') {
+      ws('workspace/ut-scala-cpu') {
+        init_git()
+        unpack_lib('cpu')
+        timeout(time: max_time, unit: 'MINUTES') {
+          sh "${docker_run} cpu make scalapkg USE_BLAS=openblas"
+          sh "${docker_run} cpu make scalatest USE_BLAS=openblas"
+        }
+      }
+    }
+  },
+  'Python2/3: CPU Win': {
+    node('windows') {
+      ws('workspace/ut-python-cpu') {
+        init_git_win()
+        unstash 'vc14_cpu'
+        bat '''rmdir /s/q pkg_vc14_cpu
+7z x -y vc14_cpu.7z'''
+        bat """xcopy C:\\mxnet\\data data /E /I /Y
+xcopy C:\\mxnet\\model model /E /I /Y
+call activate py3
+set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+C:\\mxnet\\test_cpu.bat"""
+        bat """xcopy C:\\mxnet\\data data /E /I /Y
+xcopy C:\\mxnet\\model model /E /I /Y
+call activate py2
+set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python
+C:\\mxnet\\test_cpu.bat"""
+      }
+    }
+  },
+  'Python2/3: GPU Win': {
+    node('windows') {
+      ws('workspace/ut-python-gpu') {
+        init_git_win()
+        unstash 'vc14_gpu'
+        bat '''rmdir /s/q pkg_vc14_gpu
+7z x -y vc14_gpu.7z'''
+        bat """xcopy C:\\mxnet\\data data /E /I /Y
+xcopy C:\\mxnet\\model model /E /I /Y
+call activate py3
+set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+C:\\mxnet\\test_gpu.bat"""
+        bat """xcopy C:\\mxnet\\data data /E /I /Y
+xcopy C:\\mxnet\\model model /E /I /Y
+call activate py2
+set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python
+C:\\mxnet\\test_gpu.bat"""
+      }
+    }
+  }
+}
+
+
+stage('Integration Test') {
+  parallel 'Python': {
+    node('GPU' && 'linux') {
+      ws('workspace/it-python-gpu') {
+        init_git()
+        unpack_lib('gpu')
+        timeout(time: max_time, unit: 'MINUTES') {
+          sh "${docker_run} gpu PYTHONPATH=./python/ python example/image-classification/test_score.py"
+        }
+      }
+    }
+  },
+  'Caffe': {
+    node('GPU' && 'linux') {
+      ws('workspace/it-caffe') {
+        init_git()
+        unpack_lib('gpu')
+        timeout(time: max_time, unit: 'MINUTES') {
+          sh "${docker_run} caffe_gpu PYTHONPATH=/caffe/python:./python python tools/caffe_converter/test_converter.py"
+        }
+      }
+    }
+  }
+}
diff --git a/Makefile b/Makefile
old mode 100755
new mode 100644
index 6af45a9bda12..cad58d974494
--- a/Makefile
+++ b/Makefile
@@ -22,21 +22,21 @@ ifneq ($(USE_OPENMP), 1)
	export NO_OPENMP = 1
endif

-
# use customized config file
include $(config)

ifeq ($(USE_MKL2017), 1)
-	RETURN_STRING=$(shell ./prepare_mkl.sh $(MKLML_ROOT))
-	MKLROOT=$(firstword $(RETURN_STRING))
-	export USE_MKLML=$(lastword $(RETURN_STRING))
+# must run ./prepare_mkl before including mshadow.mk
+	RETURN_STRING = $(shell ./prepare_mkl.sh $(MKLML_ROOT))
+	MKLROOT = $(firstword $(RETURN_STRING))
+	export USE_MKLML = $(lastword $(RETURN_STRING))
endif

include mshadow/make/mshadow.mk
include $(DMLC_CORE)/make/dmlc.mk

# all tge possible warning tread
-WARNFLAGS= -Wall
+WARNFLAGS= -Wall -Wsign-compare
CFLAGS = -DMSHADOW_FORCE_STREAM $(WARNFLAGS)

ifeq ($(DEV), 1)
@@ -62,6 +62,11 @@ ifeq ($(USE_PROFILER), 1)
	CFLAGS += -DMXNET_USE_PROFILER=1
endif

+# Caffe Plugin
+ifdef CAFFE_PATH
+	CFLAGS += -DMXNET_USE_CAFFE=1
+endif
+
ifndef LINT_LANG
	LINT_LANG="all"
endif
@@ -69,7 +74,7 @@ endif
# setup opencv
ifeq ($(USE_OPENCV), 1)
	CFLAGS += -DMXNET_USE_OPENCV=1 $(shell pkg-config --cflags opencv)
-	LDFLAGS += $(shell pkg-config --libs opencv)
+	LDFLAGS += $(filter-out -lopencv_ts, $(shell pkg-config --libs opencv))
	BIN += bin/im2rec
else
	CFLAGS+= -DMXNET_USE_OPENCV=0
@@ -88,6 +93,8 @@ ifeq ($(USE_MKL2017), 1)
	CFLAGS += -DMXNET_USE_MKL2017=1
	CFLAGS += -DUSE_MKL=1
	CFLAGS += -I$(ROOTDIR)/src/operator/mkl/
+	CFLAGS += -I$(MKLML_ROOT)/include
+	LDFLAGS += -L$(MKLML_ROOT)/lib
	ifeq ($(USE_MKL2017_EXPERIMENTAL), 1)
		CFLAGS += -DMKL_EXPERIMENTAL=1
	else
@@ -100,6 +107,8 @@ ifeq ($(USE_CUDNN), 1)
	LDFLAGS += -lcudnn
endif

+
+
ifeq ($(USE_THREADED_ENGINE), 1)
	CFLAGS += -DMXNET_USE_THREADED_ENGINE
endif
@@ -126,10 +135,10 @@ ifeq ($(USE_DIST_KVSTORE), 1)
	LDFLAGS += $(PS_LDFLAGS_A)
endif

-.PHONY: clean all test lint doc clean_all rcpplint rcppexport roxygen\
+.PHONY: clean all extra-packages test lint doc clean_all rcpplint rcppexport roxygen\
	cython2 cython3 cython cyclean

-all: lib/libmxnet.a lib/libmxnet.so $(BIN)
+all: lib/libmxnet.a lib/libmxnet.so $(BIN) extra-packages

SRC = $(wildcard src/*/*/*.cc src/*/*.cc src/*.cc)
OBJ = $(patsubst %.cc, build/%.o, $(SRC))
@@ -178,7 +187,7 @@ ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(PLUGIN_OBJ) $(LIB_DEP)
ifeq ($(USE_CUDA), 1)
	CFLAGS += -I$(ROOTDIR)/cub
	ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ)
-	LDFLAGS += -lcuda
+	LDFLAGS += -lcuda -lcufft
	SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-gpu
else
	SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-cpu
@@ -255,12 +264,25 @@ $(BIN) :
	@mkdir -p $(@D)
	$(CXX) $(CFLAGS) -std=c++11 -o $@ $(filter %.cpp %.o %.c %.a %.cc, $^) $(LDFLAGS)

+# CPP Package
+ifeq ($(USE_CPP_PACKAGE), 1)
+include cpp-package/cpp-package.mk
+endif
+
include tests/cpp/unittest.mk

+extra-packages: $(EXTRA_PACKAGES)
+
test: $(TEST)

-lint: rcpplint jnilint
-	python2 dmlc-core/scripts/lint.py mxnet ${LINT_LANG} include src plugin scripts python predict/python
+lint: cpplint rcpplint jnilint pylint
+
+cpplint:
+	python2 dmlc-core/scripts/lint.py mxnet cpp include src plugin cpp-package
+
+pylint:
+# ideally we want to check all, such as: python tools example tests
+	pylint python/mxnet --rcfile=$(ROOTDIR)/tests/ci_build/pylintrc

doc: doxygen

@@ -307,31 +329,32 @@ scalapkg:
	(cd $(ROOTDIR)/scala-package; \
	mvn clean package -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \
	-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-	-Dlddeps="$(LIB_DEP)")
+	-Dcurrent_libdir="$(ROOTDIR)/lib" \
+	-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")

scalatest:
	(cd $(ROOTDIR)/scala-package; \
	mvn verify -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \
	-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-	-Dlddeps="$(LIB_DEP)" $(SCALA_TEST_ARGS))
+	-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a" $(SCALA_TEST_ARGS))

scalainstall:
	(cd $(ROOTDIR)/scala-package; \
	mvn install -P$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \
	-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-	-Dlddeps="$(LIB_DEP)")
+	-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")

scaladeploy:
	(cd $(ROOTDIR)/scala-package; \
	mvn deploy -Prelease,$(SCALA_PKG_PROFILE) -DskipTests -Dcxx="$(CXX)" \
	-Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \
-	-Dlddeps="$(LIB_DEP)")
+	-Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a")

jnilint:
	python2 dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src

ifneq ($(EXTRA_OPERATORS),)
-clean: cyclean
+clean: cyclean $(EXTRA_PACKAGES_CLEAN)
	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
	cd $(DMLC_CORE); $(MAKE) clean; cd -
@@ -340,7 +363,7 @@ clean: cyclean
	$(RM) -r $(patsubst %, %/*.d, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.d, $(EXTRA_OPERATORS))
	$(RM) -r $(patsubst %, %/*.o, $(EXTRA_OPERATORS)) $(patsubst %, %/*/*.o, $(EXTRA_OPERATORS))
else
-clean: cyclean
+clean: cyclean $(EXTRA_PACKAGES_CLEAN)
	$(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \
		R-package/inst R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz
	cd $(DMLC_CORE); $(MAKE) clean; cd -
diff --git a/NEWS.md b/NEWS.md
index 3bc76381bdfe..f29119be897e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,13 +3,13 @@ MXNet Change Log
## 0.9.3
- Move symbolic API to NNVM @tqchen
  - Most front-end C API are backward compatible
-  - Removed symbolic api in MXNet and relies on NNVM
+  - Removed symbolic API in MXNet and relies on NNVM
- New features:
-  - MXNet profiler for profiling operator level executions
+  - MXNet profiler for profiling operator-level executions
  - mxnet.image package for fast image loading and processing
- Change of JSON format
  - param and attr field are merged to attr
-  - New code is backward compatible can load old json format
+  - New code is backward-compatible and can load the old JSON format
- OpProperty registration now is deprecated
  - New operators are encouraged to register their property to NNVM op registry attribute
- Known features removed limitations to be fixed
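As a reader's aid for the NEWS.md hunk above, the "param and attr field are merged to attr" change can be pictured with a minimal sketch. These node entries are hypothetical (field values invented for illustration, not taken from a real model file), written as Python dicts mirroring the graph-JSON structure:

```python
# Hypothetical graph-JSON node entries illustrating the 0.9.3 format change:
# operator parameters used to live in a separate "param" field...
old_node = {"op": "Convolution", "name": "conv1",
            "param": {"kernel": "(3,3)", "num_filter": "64"},  # operator parameters
            "attr": {}}                                        # user attributes

# ...and are now folded into the single "attr" field.
new_node = {"op": "Convolution", "name": "conv1",
            "attr": {"kernel": "(3,3)", "num_filter": "64"}}
```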
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index 895d7d65127d..1ad56e33daa8 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
Package: mxnet
Type: Package
Title: MXNet
-Version: 0.9.4
+Version: 0.9.5
Date: 2015-12-23
Author: Tianqi Chen, Qiang Kou, Tong He
Maintainer: Qiang Kou
diff --git a/README.md b/README.md
index fb9809023911..62117f1eb6ff 100644
--- a/README.md
+++ b/README.md
@@ -8,15 +8,15 @@
![banner](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/banner.png)

MXNet is a deep learning framework designed for both *efficiency* and *flexibility*.
-It allows you to ***mix*** the [flavours](http://mxnet.io/architecture/index.html#deep-learning-system-design-concepts) of symbolic
-programming and imperative programming to ***maximize*** efficiency and productivity.
-In its core, a dynamic dependency scheduler that automatically parallelizes both symbolic and imperative operations on the fly.
+It allows you to ***mix*** [symbolic and imperative programming](http://mxnet.io/architecture/index.html#deep-learning-system-design-concepts)
+to ***maximize*** efficiency and productivity.
+At its core, MXNet contains a dynamic dependency scheduler that automatically parallelizes both symbolic and imperative operations on the fly.
A graph optimization layer on top of that makes symbolic execution fast and memory efficient.
-The library is portable and lightweight, and it scales to multiple GPUs and multiple machines.
+MXNet is portable and lightweight, scaling effectively to multiple GPUs and multiple machines.

MXNet is also more than a deep learning project. It is also a collection of
[blue prints and guidelines](http://mxnet.io/architecture/index.html#deep-learning-system-design-concepts) for building
-deep learning system, and interesting insights of DL systems for hackers.
+deep learning systems, and interesting insights of DL systems for hackers.

[![Join the chat at https://gitter.im/dmlc/mxnet](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/mxnet?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
@@ -55,15 +55,15 @@ Features
--------
* Design notes providing useful insights that can re-used by other DL projects
* Flexible configuration for arbitrary computation graph
-* Mix and match good flavours of programming to maximize flexibility and efficiency
+* Mix and match imperative and symbolic programming to maximize flexibility and efficiency
* Lightweight, memory efficient and portable to smart devices
* Scales up to multi GPUs and distributed setting with auto parallelism
-* Support for python, R, C++ and Julia
+* Support for Python, R, C++ and Julia
* Cloud-friendly and directly compatible with S3, HDFS, and Azure

Ask Questions
-------------
-* Please use [mxnet/issues](https://github.com/dmlc/mxnet/issues) for how to use mxnet and reporting bugs

License
-------
@@ -79,4 +79,4 @@ In Neural Information Processing Systems, Workshop on Machine Learning Systems,

History
-------
-MXNet is initiated and designed in collaboration by the authors of [cxxnet](https://github.com/dmlc/cxxnet), [minerva](https://github.com/dmlc/minerva) and [purine2](https://github.com/purine/purine2). The project reflects what we have learnt from the past projects. It combines important flavours of the existing projects for efficiency, flexibility and memory efficiency.
+MXNet emerged from a collaboration by the authors of [cxxnet](https://github.com/dmlc/cxxnet), [minerva](https://github.com/dmlc/minerva), and [purine2](https://github.com/purine/purine2). The project reflects what we have learned from the past projects. MXNet combines aspects of each of these projects to achieve flexibility, speed, and memory efficiency.
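The mixed imperative/symbolic style the README hunk above describes looks roughly like this in the Python API. This is a minimal sketch assuming the `mxnet` Python package of this era; the layer names and shapes are arbitrary:

```python
import mxnet as mx

# Imperative part: NDArray operations execute eagerly, NumPy-style.
a = mx.nd.ones((2, 3))
b = a * 2 + 1                      # computed immediately

# Symbolic part: Symbol calls only declare a graph; nothing runs yet.
data = mx.sym.Variable('data')
fc = mx.sym.FullyConnected(data=data, num_hidden=10, name='fc')
net = mx.sym.SoftmaxOutput(data=fc, name='softmax')

# Binding feeds an imperative NDArray into the symbolic graph; the dynamic
# dependency scheduler parallelizes both kinds of operations together.
exe = net.simple_bind(ctx=mx.cpu(), data=(2, 3))
exe.forward(data=b)
print(exe.outputs[0].asnumpy())
```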
diff --git a/amalgamation/Makefile b/amalgamation/Makefile
index 37c88f17a51c..2446667c1e9e 100644
--- a/amalgamation/Makefile
+++ b/amalgamation/Makefile
@@ -1,6 +1,9 @@
export MXNET_ROOT=`pwd`/..
-# Change this to path of openblas
-export OPENBLAS_ROOT=/usr/local/opt/openblas
+
+# Change this to path or specify in make command
+ifndef OPENBLAS_ROOT
+	export OPENBLAS_ROOT=/usr/local/opt/openblas
+endif

# Whether use minimum build without blas and SSE, this will make the library super slow
ifndef MIN
@@ -16,15 +19,32 @@ else
	DEFS+=-DMSHADOW_USE_SSE=0
endif

+# Use locally installed emscripten if not specified
+ifndef EMCC
+	EMCC=emcc
+endif
+
+ifndef DISABLE_OPENMP
+	DEFS+=-DDISABLE_OPENMP=1
+endif

.PHONY: all clean

DEFS+=-DMSHADOW_USE_CUDA=0 -DMSHADOW_USE_MKL=0 -DMSHADOW_RABIT_PS=0 -DMSHADOW_DIST_PS=0 -DDMLC_LOG_STACK_TRACE=0
-DEFS+=-DMSHADOW_FORCE_STREAM -DMXNET_USE_OPENCV=0 -DMXNET_PREDICT_ONLY=1 -DDISABLE_OPENMP=1
+DEFS+=-DMSHADOW_FORCE_STREAM -DMXNET_USE_OPENCV=0 -DMXNET_PREDICT_ONLY=1
CFLAGS=-std=c++11 -Wno-unknown-pragmas -Wall $(DEFS)

ifneq ($(MIN), 1)
	CFLAGS += -I${OPENBLAS_ROOT} -I${OPENBLAS_ROOT}/include
-	LDFLAGS+= -L${OPENBLAS_ROOT} -L${OPENBLAS_ROOT}/lib -lopenblas
+	LDFLAGS+= -L${OPENBLAS_ROOT} -L${OPENBLAS_ROOT}/lib
+
+	# Define which blas is installed. Uses OpenBLAS by default.
+	ifeq ($(USE_BLAS), atlas)
+		LDFLAGS += -lcblas
+	else ifeq ($(USE_BLAS), blas)
+		LDFLAGS += -lblas
+	else
+		LDFLAGS += -lopenblas
+	endif
endif
@@ -58,7 +78,7 @@ mxnet_predict-all.o: mxnet_predict-all.cc

libmxnet_predict.a: mxnet_predict-all.o
	ar rcs libmxnet_predict.a $+

-jni_libmxnet_predict.o: mxnet_predict-all.cc
+jni_libmxnet_predict.o: mxnet_predict-all.cc jni/predictor.cc
	${CXX} ${CFLAGS} -fPIC -o $@ -c jni/predictor.cc

jni_libmxnet_predict.so: jni_libmxnet_predict.o
@@ -73,15 +93,23 @@ else
endif

libmxnet_predict.js: mxnet_predict-all.cc
-	emcc -std=c++11 -O2 $(DEFS) -DMSHADOW_USE_SSE=0 -D__MXNET_JS__ -o $@ $+ \
-	-s EXPORTED_FUNCTIONS="['_MXPredCreate', '_MXPredGetOutputShape', '_MXPredSetInput', '_MXPredForward', '_MXPredPartialForward', '_MXPredGetOutput', '_MXPredFree', '_MXNDListCreate', '_MXNDListGet', '_MXNDListFree']" \
+	${EMCC} -std=c++11 -O2 $(DEFS) -DMSHADOW_USE_SSE=0 -D__MXNET_JS__ -o $@ $+ \
+	-s EXPORTED_FUNCTIONS="['_MXPredCreate', \
+	'_MXPredGetOutputShape', \
+	'_MXPredSetInput', \
+	'_MXPredForward', \
+	'_MXPredPartialForward', \
+	'_MXPredGetOutput', \
+	'_MXPredFree', \
+	'_MXNDListCreate', \
+	'_MXNDListGet', \
+	'_MXNDListFree']" \
	-s ALLOW_MEMORY_GROWTH=1
-
${MXNET_ROOT}/lib/libmxnet_predict.so: mxnet_predict-all.o
	@mkdir -p ${MXNET_ROOT}/lib
	${CXX} ${CFLAGS} -shared -o $@ $(filter %.o %.a, $^) $(LDFLAGS)
	ls -alh $@

clean:
-	rm -f *.d *.o *.so *.a mxnet_predict-all.cc nnvm.cc
+	rm -f *.d *.o *.so *.a *.js *.js.mem mxnet_predict-all.cc nnvm.cc
diff --git a/amalgamation/README.md b/amalgamation/README.md
index 5c6522c55eaa..3faf4d1e18a4 100644
--- a/amalgamation/README.md
+++ b/amalgamation/README.md
@@ -45,10 +45,15 @@ You can use generated library in [Leliana WhatsThis Android app](https://github.

Javascript
---------------
-JS version works without OpenBLAS. You need [emscripten](http://kripken.github.io/emscripten-site/) to build it.
-Type ```make MIN=1 libmxnet_predict.js```
+JS version uses [emscripten](http://kripken.github.io/emscripten-site/) to cross-compile the amalgamation source file into a Javascript library that can be integrated into client side applications.
+If you already have emscripten installed then
-You can use generated library in [mxnet.js](https://github.com/dmlc/mxnet.js)
+```make clean libmxnet_predict.js MIN=1```
+
+otherwise you can use the [emscripten docker image](https://hub.docker.com/r/apiaryio/emcc/) to compile it in the following way
+
+```make clean libmxnet_predict.js MIN=1 EMCC="docker run -v ${PWD}:/src apiaryio/emcc emcc"```
+
+An example WebApp that uses the generated JS library can be found at [mxnet.js](https://github.com/dmlc/mxnet.js)

iOS
---------------
diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 88fda297f475..da3b60ac8399 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -8,7 +8,7 @@
     'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h', 'malloc.h', 'mkl.h',
     'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h', 'nvml.h',
     'opencv2/opencv.hpp', 'sys/stat.h', 'sys/types.h', 'cuda.h', 'cuda_fp16.h',
-    'omp.h', 'execinfo.h', 'packet/sse-inl.h'
+    'omp.h', 'execinfo.h', 'packet/sse-inl.h', 'emmintrin.h', 'thrust/device_vector.h'
     ]

minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0
@@ -65,6 +65,8 @@ def find_source(name, start, stage):

out = StringIO.StringIO()

+
+
def expand(x, pending, stage):
    if x in history and x not in ['mshadow/mshadow/expr_scalar-inl.h']: # MULTIPLE includes
        return
@@ -73,7 +75,10 @@ def expand(x, pending, stage):
        #print 'loop found: %s in ' % x, pending
        return

-    print >>out, "//===== EXPANDING: %s =====\n" %x
+    whtspace = ' '*expand.treeDepth
+    expand.fileCount+=1
+    print >>out, "//=====[%3d] STAGE:%4s %sEXPANDING: %s =====\n" % (expand.fileCount, stage, whtspace, x)
+    print "//=====[%3d] STAGE:%4s %sEXPANDING: %s " % (expand.fileCount, stage, whtspace, x)
    for line in open(x):
        if line.find('#include') < 0:
            out.write(line)
@@ -95,19 +100,28 @@ def expand(x, pending, stage):
                    'nnpack' not in h and not h.endswith('.cuh')):
                sysheaders.append(h)
        else:
+            expand.treeDepth+=1
            expand(source, pending + [x], stage)
+            expand.treeDepth-=1
+
-    print >>out, "//===== EXPANDED: %s =====\n" %x
+    print >>out, "//===== EXPANDED : %s =====\n" %x
    history.add(x)

+
+# Vars to keep track of number of files expanded.
+# Used in printing informative comments.
+expand.treeDepth = 0
+expand.fileCount = 0
+
+# Expand the stages
expand(sys.argv[2], [], "dmlc")
expand(sys.argv[3], [], "nnvm")
expand(sys.argv[4], [], "src")
-
-
+# Write to amalgamation file
f = open(sys.argv[5], 'wb')

if minimum != 0:
+    sysheaders.remove('cblas.h')
    print >>f, "#define MSHADOW_STAND_ALONE 1"
    print >>f, "#define MSHADOW_USE_SSE 0"
    print >>f, "#define MSHADOW_USE_CBLAS 0"
diff --git a/amalgamation/jni/predictor.cc b/amalgamation/jni/predictor.cc
index 6ee9547b34a5..2687d1d9d93e 100644
--- a/amalgamation/jni/predictor.cc
+++ b/amalgamation/jni/predictor.cc
@@ -19,14 +19,17 @@ JNIEXPORT jlong JNICALL Java_org_dmlc_mxnet_Predictor_createPredictor
    track.emplace_back(js, s);
  }

-  std::vector<mx_uint> index{0};
+  std::vector<mx_uint> index;
  std::vector<mx_uint> shapes;
+  mx_uint prev = 0;
+  index.emplace_back(prev);
  for (int i=0; i<env->GetArrayLength(jshapes); i++) {
    jintArray jshape = (jintArray) env->GetObjectArrayElement(jshapes, i);
    jsize shape_len = env->GetArrayLength(jshape);
    jint *shape = env->GetIntArrayElements(jshape, 0);
-    index.emplace_back(shape_len);
+    prev += shape_len;
+    index.emplace_back(prev);
    for (int j=0; j<shape_len; j++) shapes.emplace_back(shape[j]);
    env->ReleaseIntArrayElements(jshape, shape, 0);
  }
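The predictor.cc hunk above replaces per-input shape lengths with cumulative offsets: the C predict API's `input_shape_indptr` argument is an index pointer into the flattened shape buffer, so each entry must be a running total rather than an individual length. A small, purely illustrative Python sketch of the fixed construction (names invented for the example):

```python
# Illustrative only: build the flattened shape buffer plus the cumulative
# index that the fixed JNI code constructs for two hypothetical inputs.
shapes_per_input = [(1, 3, 224, 224), (1, 10)]

flat = []      # all dimensions, concatenated
index = [0]    # flat[index[i]:index[i+1]] are the dims of input i
for shape in shapes_per_input:
    flat.extend(shape)
    index.append(index[-1] + len(shape))

assert index == [0, 4, 6]   # offsets, not per-input lengths as before the fix
```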
diff --git a/cmake/Modules/FindJeMalloc.cmake b/cmake/Modules/FindJeMalloc.cmake
new file mode 100644
index 000000000000..8b965cf6c3bb
--- /dev/null
+++ b/cmake/Modules/FindJeMalloc.cmake
@@ -0,0 +1,45 @@
+
+# Copyright (c) 2014 Thomas Heller
+# Copyright (c) 2007-2012 Hartmut Kaiser
+# Copyright (c) 2010-2011 Matt Anderson
+# Copyright (c) 2011 Bryce Lelbach
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+find_package(PkgConfig)
+pkg_check_modules(PC_JEMALLOC QUIET jemalloc)
+
+find_path(JEMALLOC_INCLUDE_DIR jemalloc/jemalloc.h
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_INCLUDEDIR}
+    ${PC_JEMALLOC_MINIMAL_INCLUDE_DIRS}
+    ${PC_JEMALLOC_INCLUDEDIR}
+    ${PC_JEMALLOC_INCLUDE_DIRS}
+  PATH_SUFFIXES include)
+
+find_library(JEMALLOC_LIBRARY NAMES jemalloc libjemalloc
+  HINTS
+    ${JEMALLOC_ROOT} ENV JEMALLOC_ROOT
+    ${PC_JEMALLOC_MINIMAL_LIBDIR}
+    ${PC_JEMALLOC_MINIMAL_LIBRARY_DIRS}
+    ${PC_JEMALLOC_LIBDIR}
+    ${PC_JEMALLOC_LIBRARY_DIRS}
+  PATH_SUFFIXES lib lib64)
+
+set(JEMALLOC_LIBRARIES ${JEMALLOC_LIBRARY})
+set(JEMALLOC_INCLUDE_DIRS ${JEMALLOC_INCLUDE_DIR})
+
+find_package_handle_standard_args(Jemalloc DEFAULT_MSG
+  JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
+
+get_property(_type CACHE JEMALLOC_ROOT PROPERTY TYPE)
+if(_type)
+  set_property(CACHE JEMALLOC_ROOT PROPERTY ADVANCED 1)
+  if("x${_type}" STREQUAL "xUNINITIALIZED")
+    set_property(CACHE JEMALLOC_ROOT PROPERTY TYPE PATH)
+  endif()
+endif()
+
+mark_as_advanced(JEMALLOC_ROOT JEMALLOC_LIBRARY JEMALLOC_INCLUDE_DIR)
diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake
index b3cf9e279663..9679f3d72e60 100644
--- a/cmake/Modules/FindMKL.cmake
+++ b/cmake/Modules/FindMKL.cmake
@@ -2,6 +2,8 @@
#
# Options:
#
+#   USE_MKLML_MKL : Search for MKL:ML library variant
+#
#   MKL_USE_SINGLE_DYNAMIC_LIBRARY : use single dynamic library interface
#   MKL_USE_STATIC_LIBS : use static libraries
#   MKL_MULTI_THREADED : use multi-threading
@@ -13,110 +15,168 @@
#   MKL_FOUND : True mkl is found
#   MKL_INCLUDE_DIR : unclude directory
#   MKL_LIBRARIES : the libraries to link against.
+#
+# cjolivier01: Changed to also look for MKLML library (subset of mkl) instead of standard MKL package
+#
-
-# ---[ Options
-mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON)
-mxnet_option(MKL_USE_STATIC_LIBS "Use static libraries" OFF IF NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
-mxnet_option(MKL_MULTI_THREADED "Use multi-threading" ON IF NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
-mxnet_option(MKL_USE_ILP64 "Use ilp64 data model" OFF)
-mxnet_option(MKL_USE_CLUSTER "Use cluster functions" OFF IF CMAKE_SIZEOF_VOID_P EQUAL 4)
+if(MKL_FOUND)
+  return()
+endif()

# ---[ Root folders
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKL_ROOT} ${INTEL_ROOT}/mkl
-          DOC "Folder contains MKL")
-
-# ---[ Find include dir
-find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
-set(__looked_for MKL_INCLUDE_DIR)
-
-# ---[ Find libraries
-if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-  set(__path_suffixes lib lib/ia32)
-else()
-  set(__path_suffixes lib lib/intel64)
-endif()
-set(__mkl_libs "")
-if(MKL_USE_SINGLE_DYNAMIC_LIBRARY)
-  list(APPEND __mkl_libs rt)
-else()
+if(USE_MKLML_MKL)
+
+  find_path(MKL_ROOT include/mkl_blas.h
+    PATHS $ENV{MKL_ROOT}
+          ${INTEL_ROOT}/mklml
+          ${DIRECT_DEPENDENCY_ROOTS}
+    DOC "Folder contains MKL"
+    )
+
+  # ---[ Find include dir
+  find_path(MKL_INCLUDE_DIR mkl_blas.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
+  set(__looked_for MKL_INCLUDE_DIR)
+
+  # ---[ Find libraries
  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
-    if(WIN32)
-      list(APPEND __mkl_libs intel_c)
-    else()
-      list(APPEND __mkl_libs intel)
-      if(CMAKE_COMPILER_IS_GNUFORTRAN)
-        list(APPEND __mkl_libs gf)
-      endif()
-    endif()
+    set(__path_suffixes lib lib/ia32)
  else()
-    set(__mkl_lib64_suffix "lp64")
-    if(MKL_USE_ILP64)
-      set(__mkl_lib64_suffix "ilp64")
-      add_definitions(-DMKL_ILP64)
-    endif()
-    list(APPEND __mkl_libs "intel_${__mkl_lib64_suffix}")
-    if(CMAKE_COMPILER_IS_GNUFORTRAN)
-      list(APPEND __mkl_libs "gf_${__mkl_lib64_suffix}")
-    endif()
+    set(__path_suffixes lib lib/intel64)
  endif()

-  if(MKL_MULTI_THREADED)
-    list(APPEND __mkl_libs intel_thread)
+  set(__mkl_libs "")
+
+  if(WIN32)
+    list(APPEND __mkl_libs intel)
  else()
-    list(APPEND __mkl_libs sequential)
+    list(APPEND __mkl_libs gnu)
  endif()

-  list(APPEND __mkl_libs core)
-  if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MKL_USE_CLUSTER)
-    list(APPEND __mkl_libs cdft_core)
-  endif()
-endif()
+  foreach (__lib ${__mkl_libs})
+    set(__mkl_lib "mklml_${__lib}")
+    string(TOUPPER ${__mkl_lib} __mkl_lib_upper)

-foreach (__lib ${__mkl_libs})
-  set(__mkl_lib "mkl_${__lib}")
-  string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
+    if(MKL_USE_STATIC_LIBS)
+      set(__mkl_lib "lib${__mkl_lib}.a")
+    endif()

-  if(MKL_USE_STATIC_LIBS)
-    set(__mkl_lib "lib${__mkl_lib}.a")
-  endif()
+    find_library(${__mkl_lib_upper}_LIBRARY
+          NAMES ${__mkl_lib}
+          PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
+          PATH_SUFFIXES ${__path_suffixes}
+          DOC "The path to Intel(R) MKL ${__mkl_lib} library")
+    mark_as_advanced(${__mkl_lib_upper}_LIBRARY)

+    list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY)
+    list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
+  endforeach()
+
+
+else(USE_MKLML_MKL)

-  find_library(${__mkl_lib_upper}_LIBRARY
-        NAMES ${__mkl_lib}
-        PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
-        PATH_SUFFIXES ${__path_suffixes}
-        DOC "The path to Intel(R) MKL ${__mkl_lib} library")
-  mark_as_advanced(${__mkl_lib_upper}_LIBRARY)
+  # ---[ Options
+  mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON)
+  mxnet_option(MKL_USE_STATIC_LIBS "Use static libraries" OFF IF NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+  mxnet_option(MKL_MULTI_THREADED "Use multi-threading" ON IF NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+  mxnet_option(MKL_USE_ILP64 "Use ilp64 data model" OFF)
+  mxnet_option(MKL_USE_CLUSTER "Use cluster functions" OFF IF CMAKE_SIZEOF_VOID_P EQUAL 4)

-  list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY)
-  list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
-endforeach()
+  find_path(MKL_ROOT include/mkl.h PATHS $ENV{MKL_ROOT} ${INTEL_ROOT}/mkl
+            DOC "Folder contains MKL")

+  # ---[ Find include dir
+  find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include)
+  set(__looked_for MKL_INCLUDE_DIR)

-if(NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
-  if (MKL_USE_STATIC_LIBS)
-    set(__iomp5_libs iomp5 libiomp5mt.lib)
+  # ---[ Find libraries
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    set(__path_suffixes lib lib/ia32)
  else()
-    set(__iomp5_libs iomp5 libiomp5md.lib)
+    set(__path_suffixes lib lib/intel64)
  endif()

-  if(WIN32)
-    find_path(INTEL_INCLUDE_DIR omp.h PATHS ${INTEL_ROOT} PATH_SUFFIXES include)
-    list(APPEND __looked_for INTEL_INCLUDE_DIR)
+  set(__mkl_libs "")
+  if(MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+    list(APPEND __mkl_libs rt)
+  else()
+    if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+      if(WIN32)
+        list(APPEND __mkl_libs intel_c)
+      else()
+        list(APPEND __mkl_libs intel)
+        if(CMAKE_COMPILER_IS_GNUFORTRAN)
+          list(APPEND __mkl_libs gf)
+        endif()
+      endif()
+    else()
+      set(__mkl_lib64_suffix "lp64")
+      if(MKL_USE_ILP64)
+        set(__mkl_lib64_suffix "ilp64")
+        add_definitions(-DMKL_ILP64)
+      endif()
+      list(APPEND __mkl_libs "intel_${__mkl_lib64_suffix}")
+      if(CMAKE_COMPILER_IS_GNUFORTRAN)
+        list(APPEND __mkl_libs "gf_${__mkl_lib64_suffix}")
+      endif()
+    endif()
+
+    if(MKL_MULTI_THREADED)
+      list(APPEND __mkl_libs intel_thread)
+    else()
+      list(APPEND __mkl_libs sequential)
+    endif()
+
+    list(APPEND __mkl_libs core)
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8 AND MKL_USE_CLUSTER)
+      list(APPEND __mkl_libs cdft_core)
+    endif()
  endif()

-  find_library(MKL_RTL_LIBRARY ${__iomp5_libs}
-    PATHS ${INTEL_RTL_ROOT} ${INTEL_ROOT}/compiler ${MKL_ROOT}/.. ${MKL_ROOT}/../compiler
-    PATH_SUFFIXES ${__path_suffixes}
-    DOC "Path to Path to OpenMP runtime library")
-  list(APPEND __looked_for MKL_RTL_LIBRARY)
-  list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY})
-endif()
+  foreach (__lib ${__mkl_libs})
+    set(__mkl_lib "mkl_${__lib}")
+    string(TOUPPER ${__mkl_lib} __mkl_lib_upper)
+
+    if(MKL_USE_STATIC_LIBS)
+      set(__mkl_lib "lib${__mkl_lib}.a")
+    endif()
+    find_library(${__mkl_lib_upper}_LIBRARY
+          NAMES ${__mkl_lib}
+          PATHS ${MKL_ROOT} "${MKL_INCLUDE_DIR}/.."
+          PATH_SUFFIXES ${__path_suffixes}
+          DOC "The path to Intel(R) MKL ${__mkl_lib} library")
+    mark_as_advanced(${__mkl_lib_upper}_LIBRARY)
+
+    list(APPEND __looked_for ${__mkl_lib_upper}_LIBRARY)
+    list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY})
+  endforeach()
+
+
+  if(NOT MKL_USE_SINGLE_DYNAMIC_LIBRARY)
+    if (MKL_USE_STATIC_LIBS)
+      set(__iomp5_libs iomp5 libiomp5mt.lib)
+    else()
+      set(__iomp5_libs iomp5 libiomp5md.lib)
+    endif()
+
+    if(WIN32)
+      find_path(INTEL_INCLUDE_DIR omp.h PATHS ${INTEL_ROOT} PATH_SUFFIXES include)
+      list(APPEND __looked_for INTEL_INCLUDE_DIR)
+    endif()
+
+    find_library(MKL_RTL_LIBRARY ${__iomp5_libs}
+      PATHS ${INTEL_RTL_ROOT} ${INTEL_ROOT}/compiler ${MKL_ROOT}/.. ${MKL_ROOT}/../compiler
+      PATH_SUFFIXES ${__path_suffixes}
+      DOC "Path to Path to OpenMP runtime library")
+
+    list(APPEND __looked_for MKL_RTL_LIBRARY)
+    list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY})
+  endif()
+
+endif(USE_MKLML_MKL)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for})
@@ -126,3 +186,4 @@ if(MKL_FOUND)
endif()

mxnet_clear_vars(__looked_for __mkl_libs __path_suffixes __lib_suffix __iomp5_libs)
+
diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake
index 67a130cb4bc3..a0c96c3f02f7 100644
--- a/cmake/Modules/FindOpenBLAS.cmake
+++ b/cmake/Modules/FindOpenBLAS.cmake
@@ -1,3 +1,6 @@
+if(MKL_FOUND)
+  message(ERROR " OpenBLAS is not required since MKL is enabled")
+endif()
file(TO_CMAKE_PATH "$ENV{OpenBLAS_HOME}" OpenBLAS_HOME)
file(TO_CMAKE_PATH "$ENV{OpenBLAS}" OpenBLAS_DIR)
diff --git a/cpp-package/.travis.yml b/cpp-package/.travis.yml
new file mode 100644
index 000000000000..e7a332d09125
--- /dev/null
+++ b/cpp-package/.travis.yml
@@ -0,0 +1,48 @@
+sudo: false
+
+language: cpp
+
+os:
+  - linux
+# disable for now since clang doesn't support openmp
+#  - osx
+
+env:
+  # code analysis
+  - TASK=lint
+  # TODO: build example
+  - TASK=build
+
+# dependent apt packages
+addons:
+  apt:
+    sources:
+      - ubuntu-toolchain-r-test
+    packages:
+      - gcc-4.8
+      - g++-4.8
+#      - wget
+#      - git
+#      - libcurl4-openssl-dev
+#      - unzip
+#      - libatlas-dev
+#      - libopencv-dev
+
+before_install:
+
+install:
+  - source tests/travis/setup.sh
+
+script:
+  - tests/travis/run_test.sh
+
+cache:
+  directories:
+    - ${HOME}/.cache/usr
+
+notifications:
+# Emails are sent to the committer's git-configured email address by default,
+  email:
+    on_success: change
+    on_failure: always
+  #slack: dmlc:NmroCzntCiWOuxUZpii40USd
diff --git a/cpp-package/CMakeLists.txt b/cpp-package/CMakeLists.txt
new file mode 100644
index 000000000000..2cc322bdd2a4
--- /dev/null
+++ b/cpp-package/CMakeLists.txt
@@ -0,0 +1,18 @@
+
+if(USE_CPP_PACKAGE AND NOT MSVC)
+
+set(CPP_PACKAGE_OP_H_HEADER ${CMAKE_CURRENT_LIST_DIR}/include/mxnet-cpp/op.h)
+
+add_custom_target(
+  cpp_package_op_h ALL
+  BYPRODUCTS ${CPP_PACKAGE_OP_H_HEADER}
+  MAIN_DEPENDENCY mxnet
+  DEPENDS mxnet ${CMAKE_SOURCE_DIR}/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
+  COMMAND echo "Running: OpWrapperGenerator.py"
+  COMMAND python OpWrapperGenerator.py ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/$<TARGET_FILE_NAME:mxnet>
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/cpp-package/src/OpWrapperGenerator/
+)
+
+add_subdirectory(example)
+
+endif()
\ No newline at end of file
diff --git a/cpp-package/LICENSE b/cpp-package/LICENSE
new file mode 100644
index 000000000000..2525650c621b
--- /dev/null
+++ b/cpp-package/LICENSE
@@ -0,0 +1,13 @@
+Copyright (c) 2015 by Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/cpp-package/README.md b/cpp-package/README.md
new file mode 100644
index 000000000000..dcfcbc81f3a7
--- /dev/null
+++ b/cpp-package/README.md
@@ -0,0 +1,8 @@
+# MxNet C++ Package
+
+[![Build Status](https://travis-ci.org/dmlc/MXNet.cpp.svg?branch=master)](https://travis-ci.org/dmlc/MXNet.cpp)
+[![Build status](https://ci.appveyor.com/api/projects/status/ckfq6j53sg5ll01d/branch/master?svg=true)](https://ci.appveyor.com/project/lx75249/mxnet-cpp/branch/master)
+
+The examples dir contains examples for you to get started.
+The lib dir should contain the compiled mxnet library.
+Windows dir contains Visual C++ solution files and project files.
diff --git a/cpp-package/cpp-package.mk b/cpp-package/cpp-package.mk
new file mode 100644
index 000000000000..54f4cce4fa7f
--- /dev/null
+++ b/cpp-package/cpp-package.mk
@@ -0,0 +1,28 @@
+ifndef LINT_LANG
+	LINT_LANG="all"
+endif
+
+ifdef CAFFE_PATH
+export LD_LIBRARY_PATH=$(CAFFE_PATH)/lib
+endif
+
+CPP_PACKAGE_OP_H_FILE = cpp-package/include/mxnet-cpp/op.h
+
+EXTRA_PACKAGES += cpp-package-all
+EXTRA_PACKAGES_CLEAN += cpp-package-clean
+
+.PHONY: cpp-package-all cpp-package-lint cpp-package-clean
+
+cpp-package-all: $(CPP_PACKAGE_OP_H_FILE)
+
+cpp-package-clean:
+	rm -f $(CPP_PACKAGE_OP_H_FILE)
+
+$(CPP_PACKAGE_OP_H_FILE): lib/libmxnet.so cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
+	(cd cpp-package/src/OpWrapperGenerator; python OpWrapperGenerator.py $(ROOTDIR)/lib/libmxnet.so)
+
+cpp-package-lint:
+	(cd cpp-package; python scripts/lint.py dmlc ${LINT_LANG} include example)
+
+include cpp-package/example/example.mk
+
diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt
new file mode 100644
index 000000000000..2f596678415a
--- /dev/null
+++ b/cpp-package/example/CMakeLists.txt
@@ -0,0 +1,54 @@
+
+if(NOT MSVC)
+  set(UNITTEST_STATIC_LINK ON)
+endif()
+
+set(CPP_EXAMPLE_LIBS
+  rt
+  ${BEGIN_WHOLE_ARCHIVE} mxnet_static ${END_WHOLE_ARCHIVE}
+  dmlccore
+  ${mxnet_LINKER_LIBS}
+  )
+
+set(CPP_PACKAGE_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/../include/mxnet-cpp/)
+
+set(CPPEX_DEPS cpp_package_op_h)
+
+file(GLOB_RECURSE CPP_PACKAGE_HEADERS
+  "${CPP_PACKAGE_INCLUDE_DIR}/*.h"
+  "${CPP_PACKAGE_INCLUDE_DIR}/*.hpp"
+  )
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
+
+add_executable(lenet lenet.cpp ${CPP_PACKAGE_HEADERS})
+target_link_libraries(lenet ${CPP_EXAMPLE_LIBS})
+add_dependencies(lenet ${CPPEX_DEPS})
+
+add_executable(lenet_with_mxdataiter lenet_with_mxdataiter.cpp ${CPP_PACKAGE_HEADERS})
+target_link_libraries(lenet_with_mxdataiter ${CPP_EXAMPLE_LIBS})
+add_dependencies(lenet_with_mxdataiter ${CPPEX_DEPS})
+
+add_executable(alexnet alexnet.cpp ${CPP_PACKAGE_HEADERS})
+target_link_libraries(alexnet ${CPP_EXAMPLE_LIBS})
+add_dependencies(alexnet ${CPPEX_DEPS})
+
+add_executable(charRNN charRNN.cpp ${CPP_PACKAGE_HEADERS})
+target_link_libraries(charRNN ${CPP_EXAMPLE_LIBS})
+add_dependencies(charRNN ${CPPEX_DEPS})
+
+add_executable(googlenet googlenet.cpp ${CPP_PACKAGE_HEADERS})
+target_link_libraries(googlenet ${CPP_EXAMPLE_LIBS})
+add_dependencies(googlenet ${CPPEX_DEPS}) + +add_executable(inception_bn inception_bn.cpp ${CPP_PACKAGE_HEADERS}) +target_link_libraries(inception_bn ${CPP_EXAMPLE_LIBS}) +add_dependencies(inception_bn ${CPPEX_DEPS}) + +add_executable(mlp mlp.cpp ${CPP_PACKAGE_HEADERS}) +target_link_libraries(mlp ${CPP_EXAMPLE_LIBS}) +add_dependencies(mlp ${CPPEX_DEPS}) + +add_executable(resnet resnet.cpp ${CPP_PACKAGE_HEADERS}) +target_link_libraries(resnet ${CPP_EXAMPLE_LIBS}) +add_dependencies(resnet ${CPPEX_DEPS}) diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp new file mode 100644 index 000000000000..c0d8273d559b --- /dev/null +++ b/cpp-package/example/alexnet.cpp @@ -0,0 +1,303 @@ +/*! + * Copyright (c) 2016 by Contributors + */ +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +// Allow IDE to parse the types +#include "../include/mxnet-cpp/op.h" + +using namespace std; +using namespace mxnet::cpp; + +Symbol AlexnetSymbol(int num_classes) { + auto input_data = Symbol::Variable("data"); + auto target_label = Symbol::Variable("label"); + /*stage 1*/ + auto conv1 = Operator("Convolution") + .SetParam("kernel", Shape(11, 11)) + .SetParam("num_filter", 96) + .SetParam("stride", Shape(4, 4)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(0, 0)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", input_data) + .CreateSymbol("conv1"); + auto relu1 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv1) + .CreateSymbol("relu1"); + auto pool1 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu1) + .CreateSymbol("pool1"); + auto lrn1 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool1) + .CreateSymbol("lrn1"); + /*stage 2*/ + auto conv2 = Operator("Convolution") + .SetParam("kernel", Shape(5, 5)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(2, 2)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", lrn1) + .CreateSymbol("conv2"); + auto relu2 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv2) + .CreateSymbol("relu2"); + auto pool2 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") /*avg,max,sum */ + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu2) + .CreateSymbol("pool2"); + auto lrn2 = Operator("LRN") + .SetParam("nsize", 5) + .SetParam("alpha", 0.0001) + .SetParam("beta", 0.75) + .SetParam("knorm", 1) + .SetInput("data", pool2) + .CreateSymbol("lrn2"); + /*stage 3*/ + auto conv3 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", lrn2) + .CreateSymbol("conv3"); + auto relu3 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv3) + 
.CreateSymbol("relu3"); + auto conv4 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 384) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", relu3) + .CreateSymbol("conv4"); + auto relu4 = Operator("Activation") + .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */ + .SetInput("data", conv4) + .CreateSymbol("relu4"); + auto conv5 = Operator("Convolution") + .SetParam("kernel", Shape(3, 3)) + .SetParam("num_filter", 256) + .SetParam("stride", Shape(1, 1)) + .SetParam("dilate", Shape(1, 1)) + .SetParam("pad", Shape(1, 1)) + .SetParam("num_group", 1) + .SetParam("workspace", 512) + .SetParam("no_bias", false) + .SetInput("data", relu4) + .CreateSymbol("conv5"); + auto relu5 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", conv5) + .CreateSymbol("relu5"); + auto pool3 = Operator("Pooling") + .SetParam("kernel", Shape(3, 3)) + .SetParam("pool_type", "max") + .SetParam("global_pool", false) + .SetParam("stride", Shape(2, 2)) + .SetParam("pad", Shape(0, 0)) + .SetInput("data", relu5) + .CreateSymbol("pool3"); + /*stage4*/ + auto flatten = + Operator("Flatten").SetInput("data", pool3).CreateSymbol("flatten"); + auto fc1 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", flatten) + .CreateSymbol("fc1"); + auto relu6 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc1) + .CreateSymbol("relu6"); + auto dropout1 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu6) + .CreateSymbol("dropout1"); + /*stage5*/ + auto fc2 = Operator("FullyConnected") + .SetParam("num_hidden", 4096) + .SetParam("no_bias", false) + .SetInput("data", dropout1) + .CreateSymbol("fc2"); + auto relu7 = Operator("Activation") + .SetParam("act_type", "relu") + .SetInput("data", fc2) + .CreateSymbol("relu7"); + auto dropout2 = Operator("Dropout") + .SetParam("p", 0.5) + .SetInput("data", relu7) + .CreateSymbol("dropout2"); + /*stage6*/ + auto fc3 = Operator("FullyConnected") + .SetParam("num_hidden", num_classes) + .SetParam("no_bias", false) + .SetInput("data", dropout2) + .CreateSymbol("fc3"); + auto softmax = Operator("SoftmaxOutput") + .SetParam("grad_scale", 1) + .SetParam("ignore_label", -1) + .SetParam("multi_output", false) + .SetParam("use_ignore", false) + .SetParam("normalization", "null") /*batch,null,valid */ + .SetInput("data", fc3) + .SetInput("label", target_label) + .CreateSymbol("softmax"); + return softmax; +} + +int main(int argc, char const *argv[]) { + /*basic config*/ + int batch_size = 256; + int max_epo = 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + /*context and net symbol*/ + auto ctx = Context::gpu(); + auto Net = AlexnetSymbol(10); + + /*args_map and aux_map is used for parameters' saving*/ + map args_map; + map aux_map; + + /*we should tell mxnet the shape of data and label*/ + args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx); + args_map["label"] = NDArray(Shape(batch_size), ctx); + + /*with data and label, executor can be generated automatically*/ + auto *exec = Net.SimpleBind(ctx, args_map); + aux_map = exec->aux_dict(); + args_map = exec->arg_dict(); + + /*if fine tune from some pre-trained model, we should load the parameters*/ + // NDArray::Load("./model/alex_params_3", nullptr, &args_map); + /*else, we should use initializer 
Xavier to init the params*/
+  Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
+  for (auto &arg : args_map) {
+    /*be careful here: an arg's name must have one of the specific prefixes or
+     * suffixes that the initializer looks for*/
+    xavier(arg.first, &arg.second);
+  }
+  /*print out to check the shape of the net*/
+  for (const auto &s : Net.ListArguments()) {
+    LG << s;
+    const auto &k = args_map[s].GetShape();
+    for (const auto &i : k) {
+      cout << i << " ";
+    }
+    cout << endl;
+  }
+
+  /*these binary files should be generated using the im2rec tool, which can be
+   * found in mxnet/bin*/
+  auto train_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./data/train_rec.lst")
+      .SetParam("path_imgrec", "./data/train_rec.bin")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .CreateDataIter();
+  auto val_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./data/val_rec.lst")
+      .SetParam("path_imgrec", "./data/val_rec.bin")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+     ->SetParam("rescale_grad", 1.0 / batch_size)
+     ->SetParam("clip_gradient", 10);
+
+  Accuracy acu_train, acu_val;
+  LogLoss logloss_val;
+  for (int iter = 0; iter < max_epo; ++iter) {
+    LG << "Train Epoch: " << iter;
+    /*reset the metric every epoch*/
+    acu_train.Reset();
+    /*reset the data iter every epoch*/
+    train_iter.Reset();
+    while (train_iter.Next()) {
+      auto batch = train_iter.GetDataBatch();
+      LG << train_iter.GetDataBatch().index.size();
+      /*use CopyTo to feed new data and label to the executor*/
+      batch.data.CopyTo(&args_map["data"]);
+      batch.label.CopyTo(&args_map["label"]);
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      NDArray::WaitAll();
+      acu_train.Update(batch.label, exec->outputs[0]);
+    }
+    LG << "ITER: " << iter << " Train Accuracy: " << acu_train.Get();
+
+    LG << "Val Epoch: " << iter;
+    acu_val.Reset();
+    val_iter.Reset();
+    logloss_val.Reset();
+    while (val_iter.Next()) {
+      auto batch = val_iter.GetDataBatch();
+      LG << val_iter.GetDataBatch().index.size();
+      batch.data.CopyTo(&args_map["data"]);
+      batch.label.CopyTo(&args_map["label"]);
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu_val.Update(batch.label, exec->outputs[0]);
+      logloss_val.Update(batch.label, exec->outputs[0]);
+    }
+    LG << "ITER: " << iter << " Val Accuracy: " << acu_val.Get();
+    LG << "ITER: " << iter << " Val LogLoss: " << logloss_val.Get();
+
+    /*save the parameters*/
+    stringstream ss;
+    ss << iter;
+    string iter_str;
+    ss >> iter_str;
+    string save_path_param = "./model/alex_param_" + iter_str;
+    auto save_args = args_map;
+    /*we do not want to save the data and label*/
+    save_args.erase(save_args.find("data"));
+    save_args.erase(save_args.find("label"));
+    /*alexnet does not have any aux arrays, so we do not need to save
+     * aux_map*/
+    LG << "ITER: " << iter << " Saving to..." << save_path_param;
+    NDArray::Save(save_path_param, save_args);
+  }
+  /*don't forget to release the executor*/
+  delete exec;
+  MXNotifyShutdown();
+  return 0;
+}
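Each epoch above ends with NDArray::Save, so resuming training or fine-tuning later is just a matter of reading the same file back into an args map. A minimal sketch of that round-trip (the helper name and epoch argument are ours; the path pattern matches the save call in the loop above):

    #include <map>
    #include <string>
    #include "mxnet-cpp/MxNetCpp.h"

    using namespace mxnet::cpp;

    // Load the arg NDArrays saved after a given epoch back into an args_map,
    // e.g. before fine-tuning or running inference.
    std::map<std::string, NDArray> LoadAlexnetParams(int epoch) {
      std::map<std::string, NDArray> args_map;
      // nullptr: we only want the name->array map, not the ordered list
      NDArray::Load("./model/alex_param_" + std::to_string(epoch),
                    nullptr, &args_map);
      // "data" and "label" were erased before saving, so the caller must
      // re-create them with the desired batch shape before SimpleBind.
      return args_map;
    }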
diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp
new file mode 100644
index 000000000000..daf31a4fe69c
--- /dev/null
+++ b/cpp-package/example/charRNN.cpp
@@ -0,0 +1,719 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * Hua Zhang mz24cn@hotmail.com
+ * The code implements a C++ version of the charRNN in mxnet\example\rnn\char-rnn.ipynb using the MXNet.cpp API.
+ * The generated params file is compatible with the Python version.
+ * train() and predict() have been verified against the original data samples.
+ * 2017/1/23:
+ * Added a faster charRNN version based on the built-in cuDNN RNN operator; about 10 times faster.
+ * Added a time-major computation graph, although there is no substantial performance difference.
+ * Support continuing training from the last params file.
+ * Renamed params files so that epoch numbering starts from zero.
+ */
+
+#pragma warning(disable: 4996)  // VS2015 complains on 'std::copy' ...
+#include <cstring>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <unordered_map>
+#include <vector>
+#include <string>
+#include <tuple>
+#include <algorithm>
+#include <functional>
+#include <chrono>
+#include "mxnet-cpp/MxNetCpp.h"
+
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+
+struct LSTMState {
+  Symbol C;
+  Symbol h;
+};
+
+struct LSTMParam {
+  Symbol i2h_weight;
+  Symbol i2h_bias;
+  Symbol h2h_weight;
+  Symbol h2h_bias;
+};
+
+bool TIME_MAJOR = true;
+
+// LSTM Cell symbol
+LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state,
+               const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) {
+  auto input = dropout > 0? Dropout(indata, dropout) : indata;
+  auto prefix = string("t") + to_string(seqidx) + "_l" + to_string(layeridx);
+  auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias,
+                            num_hidden * 4);
+  auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias,
+                            num_hidden * 4);
+  auto gates = i2h + h2h;
+  auto slice_gates = SliceChannel(prefix + "_slice", gates, 4);
+  auto in_gate = Activation(slice_gates[0], ActivationActType::sigmoid);
+  auto in_transform = Activation(slice_gates[1], ActivationActType::tanh);
+  auto forget_gate = Activation(slice_gates[2], ActivationActType::sigmoid);
+  auto out_gate = Activation(slice_gates[3], ActivationActType::sigmoid);
+
+  LSTMState state;
+  state.C = (forget_gate * prev_state.C) + (in_gate * in_transform);
+  state.h = out_gate * Activation(state.C, ActivationActType::tanh);
+  return state;
+}
+
+Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
+                  int num_hidden, int num_embed, mx_float dropout = 0) {
+  auto isTrain = sequence_length > 1;
+  auto data = Symbol::Variable("data");
+  if (TIME_MAJOR && isTrain)
+    data = transpose(data);
+  auto embed_weight = Symbol::Variable("embed_weight");
+  auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed);
+  auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 0 : 1, true) : embed;
+
+  vector<LSTMState> last_states;
+  vector<LSTMParam> param_cells;
+  for (int l = 0; l < num_lstm_layer; l++) {
+    string layer = "l" + to_string(l);
+    LSTMParam param;
+    param.i2h_weight = Symbol::Variable(layer + "_i2h_weight");
+    param.i2h_bias = Symbol::Variable(layer + "_i2h_bias");
+    param.h2h_weight = Symbol::Variable(layer + "_h2h_weight");
+    param.h2h_bias = Symbol::Variable(layer + "_h2h_bias");
+    param_cells.push_back(param);
+    LSTMState state;
+    state.C = Symbol::Variable(layer + "_init_c");
+    state.h = Symbol::Variable(layer + "_init_h");
+    last_states.push_back(state);
+  }
+
+  vector<Symbol> hidden_all;
+  for (int i = 0; i < sequence_length; i++) {
+    auto hidden = wordvec[i];
+    for (int layer = 0; layer < num_lstm_layer; layer++) {
+      double dp_ratio = layer == 0?
0 : dropout; + auto next_state = LSTM(num_hidden, hidden, last_states[layer], param_cells[layer], + i, layer, dp_ratio); + hidden = next_state.h; + last_states[layer] = next_state; + } + if (dropout > 0) + hidden = Dropout(hidden, dropout); + hidden_all.push_back(hidden); + } + + auto hidden_concat = isTrain? Concat(hidden_all, hidden_all.size(), 0) : hidden_all[0]; + auto cls_weight = Symbol::Variable("cls_weight"); + auto cls_bias = Symbol::Variable("cls_bias"); + auto pred = FullyConnected("pred", hidden_concat, cls_weight, cls_bias, input_dim); + + auto label = Symbol::Variable("softmax_label"); + label = transpose(label); + label = Reshape(label, Shape(), false, false, Shape(-1)); // -1: infer from graph + auto sm = SoftmaxOutput("softmax", pred, label); + if (isTrain) + return sm; + + vector outputs = { sm }; + for (auto& state : last_states) { + outputs.push_back(state.C); + outputs.push_back(state.h); + } + return Symbol::Group(outputs); +} + +// Currently mxnet GPU version RNN operator is implemented via *fast* NVIDIA cuDNN. +Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_dim, + int num_hidden, int num_embed, mx_float dropout = 0) { + auto isTrain = sequence_length > 1; + auto data = Symbol::Variable("data"); + if (TIME_MAJOR && isTrain) + data = transpose(data); + + auto embed_weight = Symbol::Variable("embed_weight"); + auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed); + auto label = Symbol::Variable("softmax_label"); + label = transpose(label); + label = Reshape(label, Shape(), false, + false, Shape(-1)); // FullyConnected requires one dimension + if (!TIME_MAJOR && isTrain) + embed = SwapAxis(embed, 0, 1); // Change to time-major as cuDNN requires + + // We need not do the SwapAxis op as python version does. Direct and better performance in C++! + auto rnn_h_init = Symbol::Variable("LSTM_init_h"); + auto rnn_c_init = Symbol::Variable("LSTM_init_c"); + auto rnn_params = Symbol::Variable("LSTM_parameters"); // See explanations near RNNXavier class + auto rnn = RNN(embed, rnn_params, rnn_h_init, rnn_c_init, num_hidden, num_lstm_layer, + RNNMode::lstm, false, dropout, !isTrain); + auto hidden = Reshape(rnn[0], Shape(), false, false, Shape(-1, num_hidden)); + + auto cls_weight = Symbol::Variable("cls_weight"); + auto cls_bias = Symbol::Variable("cls_bias"); + auto pred = FullyConnected("pred", hidden, cls_weight, cls_bias, input_dim); + /*In rnn-time-major/rnn_cell_demo.py, the author claimed time-major version speeds up + * 1.5~2 times versus batch version. I doubts on the conclusion. In my test, the performance + * of both codes are almost same. In fact, there are no substantially differences between + * two codes. They are both based on time major cuDNN, the computation graph only differs + * slightly on the choices of where to put Reshape/SwapAxis/transpose operation. Here I don't + * use Reshape on pred and keep label shape on SoftmaxOutput like time major version code, + * but Reshape on label for simplification. It doesn't make influence on performacne. 
*/ + + auto sm = SoftmaxOutput("softmax", pred, label); + if (isTrain) + return sm; + else + return Symbol::Group({ sm, rnn[1/*RNNOpOutputs::kStateOut=1*/], + rnn[2/*RNNOpOutputs::kStateCellOut=2*/] }); +} + +class Shuffler { + vector sequence; + public: + explicit Shuffler(int size) : sequence(size) { + int* p = sequence.data(); + for (int i = 0; i < size; i++) + *p++ = i; + } + void shuffle(function lambda = nullptr) { + random_shuffle(sequence.begin(), sequence.end()); + int n = 0; + if (lambda != nullptr) + for (int i : sequence) + lambda(n++, i); + } + const int* data() { + return sequence.data(); + } +}; + +class BucketSentenceIter : public DataIter { + Shuffler* random; + int batch, current, end, sequence_length; + Context device; + vector> sequences; + vector index2chars; + unordered_map charIndices; + + public: + BucketSentenceIter(string filename, int minibatch, Context context) : batch(minibatch), + current(-1), device(context) { + auto content = readContent(filename); + buildCharIndex(content); + sequences = convertTextToSequences(content, '\n'); + + int N = sequences.size() / batch * batch; // total used samples + sequences.resize(N); + sort(sequences.begin(), sequences.end(), [](const vector& a, + const vector& b) { return a.size() < b.size(); }); + + sequence_length = sequences.back().size(); + random = new Shuffler(N); + // We still can get random results if call Reset() firstly +// vector>* target = &sequences; +// random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); }); + end = N / batch; + } + virtual ~BucketSentenceIter() { + delete random; + } + + unsigned int maxSequenceLength() { + return sequence_length; + } + + size_t characterSize() { + return charIndices.size(); + } + + virtual bool Next(void) { + return ++current < end; + } + virtual NDArray GetData(void) { + const int* indices = random->data(); + mx_float *data = new mx_float[sequence_length * batch], *pdata = data; + + for (int i = current * batch, end = i + batch; i < end; i++) { + memcpy(pdata, sequences[indices[i]].data(), sequences[indices[i]].size() * sizeof(mx_float)); + if (sequences[indices[i]].size() < sequence_length) + memset(pdata + sequences[indices[i]].size(), 0, + (sequence_length - sequences[indices[i]].size()) * sizeof(mx_float)); + pdata += sequence_length; + } + NDArray array(Shape(batch, sequence_length), device, false); + array.SyncCopyFromCPU(data, batch * sequence_length); + return array; + } + virtual NDArray GetLabel(void) { + const int* indices = random->data(); + mx_float *label = new mx_float[sequence_length * batch], *plabel = label; + + for (int i = current * batch, end = i + batch; i < end; i++) { + memcpy(plabel, sequences[indices[i]].data() + 1, + (sequences[indices[i]].size() - 1) * sizeof(mx_float)); + memset(plabel + sequences[indices[i]].size() - 1, 0, + (sequence_length - sequences[indices[i]].size() + 1) * sizeof(mx_float)); + plabel += sequence_length; + } + NDArray array(Shape(batch, sequence_length), device, false); + array.SyncCopyFromCPU(label, batch * sequence_length); + return array; + } + virtual int GetPadNum(void) { + return sequence_length - sequences[random->data()[current * batch]].size(); + } + virtual std::vector GetIndex(void) { + const int* indices = random->data(); + vector list(indices + current * batch, indices + current * batch + batch); + return list; + } + virtual void BeforeFirst(void) { + current = -1; + random->shuffle(nullptr); + } + + wstring readContent(const string file) { + wifstream ifs(file, ios::binary); + if (ifs) { 
+ wostringstream os; + os << ifs.rdbuf(); + return os.str(); + } + return L""; + } + + void buildCharIndex(const wstring& content) { + // This version buildCharIndex() Compatiable with python version char_rnn dictionary + int n = 1; + charIndices['\0'] = 0; // padding character + index2chars.push_back(0); // padding character index + for (auto c : content) + if (charIndices.find(c) == charIndices.end()) { + charIndices[c] = n++; + index2chars.push_back(c); + } + } +// void buildCharIndex(wstring& content) { +// for (auto c : content) +// charIndices[c]++; // char-frequency map; then char-index map +// vector> characters; +// for (auto& iter : charIndices) +// characters.push_back(make_tuple(iter.first, iter.second)); +// sort(characters.begin(), characters.end(), [](const tuple& a, +// const tuple& b) { return get<1>(a) > get<1>(b); }); +// mx_float index = 1; //0 is left for zero-padding +// index2chars.clear(); +// index2chars.push_back(0); //zero-padding +// for (auto& t : characters) { +// charIndices[get<0>(t)] = index++; +// index2chars.push_back(get<0>(t)); +// } +// } + + inline wchar_t character(int i) { + return index2chars[i]; + } + + inline mx_float index(wchar_t c) { + return charIndices[c]; + } + + void saveCharIndices(const string file) { + wofstream ofs(file, ios::binary); + if (ofs) { + ofs.write(index2chars.data() + 1, index2chars.size() - 1); + ofs.close(); + } + } + + static tuple, vector> loadCharIndices( + const string file) { + wifstream ifs(file, ios::binary); + unordered_map map; + vector chars; + if (ifs) { + wostringstream os; + os << ifs.rdbuf(); + int n = 1; + map[L'\0'] = 0; + chars.push_back(L'\0'); + for (auto c : os.str()) { + map[c] = (mx_float) n++; + chars.push_back(c); + } + } + // Note: Can't use {} because this would hit the explicit constructor + return tuple, vector>(map, chars); + } + + vector> convertTextToSequences(const wstring& content, wchar_t spliter) { + vector> sequences; + sequences.push_back(vector()); + for (auto c : content) + if (c == spliter && !sequences.back().empty()) + sequences.push_back(vector()); + else + sequences.back().push_back(charIndices[c]); + return sequences; + } +}; + +void OutputPerplexity(NDArray* labels, NDArray* output) { + vector charIndices, a; + labels->SyncCopyToCPU(&charIndices, 0L); // 0L indicates all + output->SyncCopyToCPU(&a, 0L)/*4128*84*/; + mx_float loss = 0; + int batchSize = labels->GetShape()[0]/*32*/, sequenceLength = labels->GetShape()[1]/*129*/, + nSamples = output->GetShape()[0]/*4128*/, vocabSize = output->GetShape()[1]/*84*/; + for (int n = 0; n < nSamples; n++) { + int row = n % batchSize, column = n / batchSize, labelOffset = column + + row * sequenceLength; // Search based on column storage: labels.T + mx_float safe_value = max(1e-10f, a[vocabSize * n + + static_cast(charIndices[labelOffset])]); + loss += -log(safe_value); // Calculate negative log-likelihood + } + loss = exp(loss / nSamples); + cout << "Train-Perplexity=" << loss << endl; +} + +void SaveCheckpoint(const string filepath, Symbol net, Executor* exe) { + map params; + for (auto iter : exe->arg_dict()) + if (iter.first.find("_init_") == string::npos + && iter.first.rfind("data") != iter.first.length() - 4 + && iter.first.rfind("label") != iter.first.length() - 5) + params.insert({"arg:" + iter.first, iter.second}); + for (auto iter : exe->aux_dict()) + params.insert({"aux:" + iter.first, iter.second}); + NDArray::Save(filepath, params); +} + +void LoadCheckpoint(const string filepath, Executor* exe) { + map params = 
NDArray::LoadToMap(filepath); + for (auto iter : params) { + string type = iter.first.substr(0, 4); + string name = iter.first.substr(4); + NDArray target; + if (type == "arg:") + target = exe->arg_dict()[name]; + else if (type == "aux:") + target = exe->aux_dict()[name]; + else + continue; + iter.second.CopyTo(&target); + } +} + +int input_dim = 0;/*84*/ +int sequence_length_max = 0;/*129*/ +int num_embed = 256; +int num_lstm_layer = 3; +int num_hidden = 512; +mx_float dropout = 0.2; +void train(const string file, int batch_size, int max_epoch, int start_epoch) { + Context device(DeviceType::kGPU, 0); + BucketSentenceIter dataIter(file, batch_size, device); + string prefix = file.substr(0, file.rfind(".")); + dataIter.saveCharIndices(prefix + ".dictionary"); + + input_dim = static_cast(dataIter.characterSize()); + sequence_length_max = dataIter.maxSequenceLength(); + + auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden, + num_embed, dropout); + map args_map; + args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + for (int i = 0; i < num_lstm_layer; i++) { + string key = "l" + to_string(i) + "_init_"; + args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false); + args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false); + } + vector zeros(batch_size * num_hidden, 0); + // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}}); + Executor* exe = RNN.SimpleBind(device, args_map); + + if (start_epoch == -1) { + Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : exe->arg_dict()) + xavier(arg.first, &arg.second); + } else { + LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe); + } + start_epoch++; + + mx_float learning_rate = 0.0002; + mx_float weight_decay = 0.000002; + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); +// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) +// ->SetParam("clip_gradient", 10); + + for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { + dataIter.Reset(); + auto tic = chrono::system_clock::now(); + while (dataIter.Next()) { + auto data_batch = dataIter.GetDataBatch(); + data_batch.data.CopyTo(&exe->arg_dict()["data"]); + data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros); + exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros); + } + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + exe->UpdateAll(opt, learning_rate, weight_decay); + NDArray::WaitAll(); + } + auto toc = chrono::system_clock::now(); + cout << "Epoch[" << epoch << "] Time Cost:" << + chrono::duration_cast(toc - tic).count() << " seconds "; + OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); + string filepath = prefix + "-" + to_string(epoch) + ".params"; + SaveCheckpoint(filepath, RNN, exe); + } +} + +/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on + * variable name, cannot initialize LSTM_parameters. Thus it was renamed to LSTM_bias, + * which can be initialized as zero. But it cannot converge after 100 epochs in this corpus + * example. Using RNNXavier, after 15 oscillating epochs, it rapidly converges like old + * LSTMUnroll version. 
*/ +class RNNXavier : public Xavier { + public: + RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg, + float magnitude = 3) : Xavier(rand_type, factor_type, magnitude) { + } + virtual ~RNNXavier() {} + protected: + virtual void InitDefault(NDArray* arr) { + Xavier::InitWeight(arr); + } +}; + +void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int start_epoch) { + Context device(DeviceType::kGPU, 0); + BucketSentenceIter dataIter(file, batch_size, device); + string prefix = file.substr(0, file.rfind(".")); + dataIter.saveCharIndices(prefix + ".dictionary"); + + input_dim = static_cast(dataIter.characterSize()); + sequence_length_max = dataIter.maxSequenceLength(); + + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden, + num_embed, dropout); + map args_map; + args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + // Avoiding SwapAxis, batch_size is of second dimension. + args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false); + args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false); + vector zeros(batch_size * num_lstm_layer * num_hidden, 0); + Executor* exe = RNN.SimpleBind(device, args_map); + + if (start_epoch == -1) { + RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34); + for (auto &arg : exe->arg_dict()) + xavier(arg.first, &arg.second); + } else { + LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe); + } + start_epoch++; + + mx_float learning_rate = 0.0002; + mx_float weight_decay = 0.000002; + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); +// opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) +// ->SetParam("clip_gradient", 10); + + for (int epoch = start_epoch; epoch < max_epoch; ++epoch) { + dataIter.Reset(); + auto tic = chrono::system_clock::now(); + while (dataIter.Next()) { + auto data_batch = dataIter.GetDataBatch(); + data_batch.data.CopyTo(&exe->arg_dict()["data"]); + data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]); + exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros); + exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros); + NDArray::WaitAll(); + + exe->Forward(true); + exe->Backward(); + exe->UpdateAll(opt, learning_rate, weight_decay); + NDArray::WaitAll(); + } + auto toc = chrono::system_clock::now(); + cout << "Epoch[" << epoch << "] Time Cost:" << + chrono::duration_cast(toc - tic).count() << " seconds "; + OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]); + string filepath = prefix + "-" + to_string(epoch) + ".params"; + SaveCheckpoint(filepath, RNN, exe); + } +} + +void predict(wstring* ptext, int sequence_length, const string param_file, + const string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = get<0>(results); + auto charIndices = get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + vector zeros(1 * num_hidden, 0); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + args_map[key + "c"] = NDArray(Shape(1, 
num_hidden), device, false); + args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false); + args_map[key + "c"].SyncCopyFromCPU(zeros); + args_map[key + "h"].SyncCopyFromCPU(zeros); + } + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next; + vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + for (int l = 0; l < num_lstm_layer; l++) { + string key = "l" + to_string(l) + "_init_"; + exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]); + exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]); + } + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } +} + +void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string param_file, + const string dictionary_file) { + Context device(DeviceType::kGPU, 0); + auto results = BucketSentenceIter::loadCharIndices(dictionary_file); + auto dictionary = get<0>(results); + auto charIndices = get<1>(results); + input_dim = static_cast(charIndices.size()); + auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0); + + map args_map; + args_map["data"] = NDArray(Shape(1, 1), device, false); + args_map["softmax_label"] = NDArray(Shape(1, 1), device, false); + vector zeros(1 * num_lstm_layer * num_hidden, 0); + // Avoiding SwapAxis, batch_size=1 is of second dimension. 
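+  // The built-in RNN operator expects its initial states with shape
+  // (num_layers, batch_size, num_hidden); batch_size is 1 here. The zero
+  // states set below are overwritten after every Forward() by the states
+  // returned in exe->outputs[1] and exe->outputs[2], so each generated
+  // character is conditioned on the whole preceding text.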
+ args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false); + args_map["LSTM_init_c"].SyncCopyFromCPU(zeros); + args_map["LSTM_init_h"].SyncCopyFromCPU(zeros); + Executor* exe = RNN.SimpleBind(device, args_map); + LoadCheckpoint(param_file, exe); + + mx_float index; + wchar_t next; + vector softmax; + softmax.resize(input_dim); + for (auto c : *ptext) { + exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + } + ptext->push_back(next); + + for (int i = 0; i < sequence_length; i++) { + exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1); + exe->Forward(false); + + exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim); + exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]); + exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]); + + size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin(); + index = (mx_float) n; + next = charIndices[n]; + ptext->push_back(next); + } +} + +int main(int argc, char** argv) { + if (argc < 5) { + cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}" + " {batch size} {max epoch} [{starting epoch}]" << endl; + cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}" + " {dictionary file} {beginning of text}" << endl; + cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor" + " are not compatible with each other." << endl; + return 0; + } + + string task = argv[1]; + bool builtIn = task.find("BuiltIn") != string::npos; + TIME_MAJOR = task.find("TimeMajor") != string::npos; + cout << "use BuiltIn cuDNN RNN: " << builtIn << endl + << "use data as TimeMajor: " << TIME_MAJOR << endl; + if (task.find("train") == 0) { + cout << "train batch size: " << argv[3] << endl + << "train max epoch: " << argv[4] << endl; + int start_epoch = argc > 5? atoi(argv[5]) : -1; + // this function will generate dictionary file and params file. + if (builtIn) + trainWithBuiltInRNNOp(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); + else + train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch); // ditto + } else if (task.find("predict") == 0) { + wstring text; // = L"If there is anyone out there who still doubts "; + // Considering of extending to Chinese samples in future, use wchar_t instead of char + for (char c : string(argv[4])) + text.push_back((wchar_t) c); + /*Python version predicts text default to random selecltions. Here I didn't write the random + code, always choose the 'best' character. So the text length reduced to 600. 
Longer size often + leads to repeated sentances, since training sequence length is only 129 for obama corpus.*/ + if (builtIn) + predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]); + else + predict(&text, 600, argv[2], argv[3]); + wcout << text << endl; + } + + MXNotifyShutdown(); + return 0; +} diff --git a/cpp-package/example/example.mk b/cpp-package/example/example.mk new file mode 100644 index 000000000000..3f3016a5aa24 --- /dev/null +++ b/cpp-package/example/example.mk @@ -0,0 +1,22 @@ +CPPEX_SRC = $(wildcard cpp-package/example/*.cpp) +CPPEX_EXE = $(patsubst cpp-package/example/%.cpp, build/cpp-package/example/%, $(CPPEX_SRC)) + +CPPEX_CFLAGS += -Icpp-package/include -Ibuild/cpp-package/include +CPPEX_EXTRA_LDFLAGS := -L$(ROOTDIR)/lib -lmxnet + +EXTRA_PACKAGES += cpp-package-example-all +EXTRA_PACKAGES_CLEAN += cpp-package-example-clean + +.PHONY: cpp-package-example-all cpp-package-example-clean + +cpp-package-example-all: cpp-package-all $(CPPEX_EXE) + +build/cpp-package/example/% : cpp-package/example/%.cpp lib/libmxnet.so $(CPP_PACKAGE_OP_H_FILE) + @mkdir -p $(@D) + $(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -MM -MT cpp-package/example/$* $< >build/cpp-package/example//$*.d + $(CXX) -std=c++0x $(CFLAGS) $(CPPEX_CFLAGS) -o $@ $(filter %.cpp %.a, $^) $(LDFLAGS) $(CPPEX_EXTRA_LDFLAGS) + +cpp-package-example-clean: + rm -rf build/cpp-package/example/* + +-include build/cpp-package/example/*.d diff --git a/cpp-package/example/feature_extract/Makefile b/cpp-package/example/feature_extract/Makefile new file mode 100644 index 000000000000..808f2613b001 --- /dev/null +++ b/cpp-package/example/feature_extract/Makefile @@ -0,0 +1,26 @@ +CXX=g++ +BLAS=-L /opt/openblas/lib -lopenblas -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0 +CUDA=-DMSHADOW_USE_CUDA=1 +OPENCV_CFLAGS=`pkg-config --cflags opencv` +OPENCV_LDFLAGS=`pkg-config --libs opencv` + +#COMMFLAGS=-static -static-libgcc -static-libstdc++ + +CFLAGS=$(COMMFLAGS) -I ../../include -Wall -O3 -msse3 -funroll-loops -Wno-unused-parameter -Wno-unknown-pragmas -fopenmp +LDFLAGS=$(COMMFLAGS) -L ../../lib/linux -lmxnet $(BLAS) $(CUDA) -lgomp -pthread + +all: feature_extract prepare_data_with_opencv + +feature_extract: ./feature_extract.cpp + $(CXX) -c -std=c++0x $(CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(LDFLAGS) + -rm -f $(basename $@).o + +prepare_data_with_opencv: ./prepare_data_with_opencv.cpp + $(CXX) -c -std=c++0x $(OPENCV_CFLAGS) $^ + $(CXX) $(basename $@).o -o $@ $(OPENCV_LDFLAGS) + -rm -f $(basename $@).o + +clean: + -rm -f feature_extract + -rm -f prepare_data_with_opencv diff --git a/cpp-package/example/feature_extract/feature_extract.cpp b/cpp-package/example/feature_extract/feature_extract.cpp new file mode 100644 index 000000000000..21853a3912e7 --- /dev/null +++ b/cpp-package/example/feature_extract/feature_extract.cpp @@ -0,0 +1,120 @@ +/*! + * Copyright (c) 2015 by Contributors + */ +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +using namespace std; +using namespace mxnet::cpp; + +/* + * This example shows how to extract features with a pretrained model. 
+ * Get the model here:
+ * https://github.com/dmlc/mxnet-model-gallery
+ * */
+
+/*The global context, change them if necessary*/
+Context global_ctx(kGPU, 0);
+// Context global_ctx(kCPU,0);
+
+class FeatureExtractor {
+ private:
+  /*the mean image, taken from the pretrained model*/
+  NDArray mean_img;
+  /*the following two maps store all the parameters needed by the model*/
+  map<string, NDArray> args_map;
+  map<string, NDArray> aux_map;
+  Symbol net;
+  Executor *executor;
+  /*Get the feature layer we want to extract*/
+  void GetFeatureSymbol() {
+    /*
+     * use the following to check all the layers' names:
+     * */
+    /*
+    net=Symbol::Load("./model/Inception_BN-symbol.json").GetInternals();
+    for(const auto & layer_name:net.ListOutputs()){
+      LG<<layer_name;
+    }
+    */
+    net = Symbol::Load("./model/Inception_BN-symbol.json")
+              .GetInternals()["global_pool_output"];
+  }
+  void LoadParameters() {
+    map<string, NDArray> parameters;
+    NDArray::Load("./model/Inception_BN-0039.params", 0, &parameters);
+    for (const auto &k : parameters) {
+      if (k.first.substr(0, 4) == "aux:") {
+        auto name = k.first.substr(4, k.first.size() - 4);
+        aux_map[name] = k.second.Copy(global_ctx);
+      }
+      if (k.first.substr(0, 4) == "arg:") {
+        auto name = k.first.substr(4, k.first.size() - 4);
+        args_map[name] = k.second.Copy(global_ctx);
+      }
+    }
+    /*WaitAll is needed when we copy data between the GPU and main memory*/
+    NDArray::WaitAll();
+  }
+  void GetMeanImg() {
+    mean_img = NDArray(Shape(1, 3, 224, 224), global_ctx, false);
+    mean_img.SyncCopyFromCPU(
+        NDArray::LoadToMap("./model/mean_224.nd")["mean_img"].GetData(),
+        1 * 3 * 224 * 224);
+    NDArray::WaitAll();
+  }
+
+ public:
+  FeatureExtractor() {
+    /*prepare the model, fill the pretrained parameters, get the mean image*/
+    GetFeatureSymbol();
+    LoadParameters();
+    GetMeanImg();
+  }
+
+  void Extract(NDArray data) {
+    /*Normalize the pictures*/
+    data.Slice(0, 1) -= mean_img;
+    data.Slice(1, 2) -= mean_img;
+    args_map["data"] = data;
+    /*bind the executor*/
+    executor = net.SimpleBind(global_ctx, args_map, map<string, NDArray>(),
+                              map<string, OpReqType>(), aux_map);
+    executor->Forward(false);
+    /*print out the features*/
+    auto array = executor->outputs[0].Copy(Context(kCPU, 0));
+    NDArray::WaitAll();
+    for (int i = 0; i < 1024; ++i) {
+      cout << array.At(0, i) << ",";
+    }
+    cout << endl;
+  }
+};
+
+NDArray Data2NDArray() {
+  NDArray ret(Shape(2, 3, 224, 224), global_ctx, false);
+  ifstream inf("./img.dat", ios::binary);
+  vector<float> data(2 * 3 * 224 * 224);
+  inf.read(reinterpret_cast<char *>(data.data()), 2 * 3 * 224 * 224 * sizeof(float));
+  inf.close();
+  ret.SyncCopyFromCPU(data.data(), 2 * 3 * 224 * 224);
+  NDArray::WaitAll();
+  return ret;
+}
+
+int main() {
+  /*
+   * get the data from a binary file ./img.dat
+   * this file is generated by ./prepare_data_with_opencv
+   * it stores 2 pictures in NDArray format
+   *
+   */
+  auto data = Data2NDArray();
+  FeatureExtractor fe;
+  fe.Extract(data);
+  return 0;
+}
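The two 1024-dimensional feature vectors that Extract() prints can be put to work directly, for example to compare the two images. A minimal sketch, assuming the features are captured into vectors instead of being written to stdout (CosineSimilarity is our name, not part of the example):

    #include <cmath>
    #include <vector>

    // Cosine similarity between two feature vectors of equal length;
    // values near 1 mean the images look similar to the network.
    float CosineSimilarity(const std::vector<float> &a,
                           const std::vector<float> &b) {
      float dot = 0, na = 0, nb = 0;
      for (size_t i = 0; i < a.size(); ++i) {
        dot += a[i] * b[i];
        na += a[i] * a[i];
        nb += b[i] * b[i];
      }
      // small epsilon guards against division by zero
      return dot / (std::sqrt(na) * std::sqrt(nb) + 1e-12f);
    }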
diff --git a/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp
new file mode 100644
index 000000000000..20cbe140fc09
--- /dev/null
+++ b/cpp-package/example/feature_extract/prepare_data_with_opencv.cpp
@@ -0,0 +1,37 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <opencv2/opencv.hpp>
+
+using namespace std;
+
+/*read images and store them in the NDArray format that MXNet.cpp can handle*/
+void Mat2Array() {
+  string file_name_list[] = {"./1.jpg", "./2.jpg"};
+
+  std::vector<float> array;
+  for (auto &t : file_name_list) {
+    cv::Mat mat = cv::imread(t);
+    /*resize pictures to (224, 224) according to the pretrained model*/
+    cv::resize(mat, mat, cv::Size(224, 224));
+    for (int c = 0; c < 3; ++c) {
+      for (int i = 0; i < 224; ++i) {
+        for (int j = 0; j < 224; ++j) {
+          array.push_back(static_cast<float>(mat.data[(i * 224 + j) * 3 + c]));
+        }
+      }
+    }
+  }
+  ofstream outf("./img.dat", ios::binary);
+  outf.write(reinterpret_cast<char *>(array.data()), array.size() * sizeof(float));
+  outf.close();
+}
+
+int main(int argc, char *argv[]) {
+  Mat2Array();
+  return 0;
+}
diff --git a/cpp-package/example/feature_extract/run.sh b/cpp-package/example/feature_extract/run.sh
new file mode 100755
index 000000000000..afac492b0a9d
--- /dev/null
+++ b/cpp-package/example/feature_extract/run.sh
@@ -0,0 +1,12 @@
+### To run this example,
+###
+### 1.
+### Get the Inception-BN model first, from here:
+### https://github.com/dmlc/mxnet-model-gallery
+###
+### 2.
+### Then prepare 2 pictures, 1.jpg and 2.jpg, to extract features from
+
+make
+./prepare_data_with_opencv
+LD_LIBRARY_PATH=../../lib/linux ./feature_extract
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
new file mode 100644
index 000000000000..51ff9f8b7197
--- /dev/null
+++ b/cpp-package/example/googlenet.cpp
@@ -0,0 +1,165 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <map>
+#include <string>
+
+#include "mxnet-cpp/MxNetCpp.h"
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace mxnet::cpp;
+
+Symbol ConvFactory(Symbol data, int num_filter,
+                   Shape kernel,
+                   Shape stride = Shape(1, 1),
+                   Shape pad = Shape(0, 0),
+                   const std::string & name = "",
+                   const std::string & suffix = "") {
+  Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b");
+
+  Symbol conv = Convolution("conv_" + name + suffix, data,
+                            conv_w, conv_b, kernel,
+                            num_filter, stride, Shape(1, 1), pad);
+  return Activation("relu_" + name + suffix, conv, "relu");
+}
+
+Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red,
+                        int num_3x3, int num_d5x5red, int num_d5x5,
+                        PoolingPoolType pool, int proj, const std::string & name) {
+  Symbol c1x1 = ConvFactory(data, num_1x1, Shape(1, 1),
+                            Shape(1, 1), Shape(0, 0), name + "_1x1");
+
+  Symbol c3x3r = ConvFactory(data, num_3x3red, Shape(1, 1),
+                             Shape(1, 1), Shape(0, 0), name + "_3x3", "_reduce");
+
+  Symbol c3x3 = ConvFactory(c3x3r, num_3x3, Shape(3, 3),
+                            Shape(1, 1), Shape(1, 1), name + "_3x3");
+
+  Symbol cd5x5r = ConvFactory(data, num_d5x5red, Shape(1, 1),
+                              Shape(1, 1), Shape(0, 0), name + "_5x5", "_reduce");
+
+  Symbol cd5x5 = ConvFactory(cd5x5r, num_d5x5, Shape(5, 5),
+                             Shape(1, 1), Shape(2, 2), name + "_5x5");
+
+  Symbol pooling = Pooling(name + "_pool", data, Shape(3, 3), pool,
+                           false, false, PoolingPoolingConvention::valid,
+                           Shape(1, 1), Shape(1, 1));
+
+  Symbol cproj = ConvFactory(pooling, proj, Shape(1, 1),
+                             Shape(1, 1), Shape(0, 0), name + "_proj");
+
+  std::vector<Symbol> lst;
+  lst.push_back(c1x1);
+  lst.push_back(c3x3);
+  lst.push_back(cd5x5);
+  lst.push_back(cproj);
+  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
+}
+
+Symbol GoogleNetSymbol(int num_classes) {
+  // data and label
+  Symbol data = Symbol::Variable("data");
Symbol data_label = Symbol::Variable("data_label"); + + Symbol conv1 = ConvFactory(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); + Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol conv2 = ConvFactory(pool1, 64, Shape(1, 1), Shape(1, 1), + Shape(0, 0), "conv2"); + Symbol conv3 = ConvFactory(conv2, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv3"); + Symbol pool3 = Pooling("pool3", conv3, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + + Symbol in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, PoolingPoolType::max, 32, "in3a"); + Symbol in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, PoolingPoolType::max, 64, "in3b"); + Symbol pool4 = Pooling("pool4", in3b, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, PoolingPoolType::max, 64, "in4a"); + Symbol in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, PoolingPoolType::max, 64, "in4b"); + Symbol in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, PoolingPoolType::max, 64, "in4c"); + Symbol in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, PoolingPoolType::max, 64, "in4d"); + Symbol in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, PoolingPoolType::max, 128, "in4e"); + Symbol pool5 = Pooling("pool5", in4e, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + Symbol in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, PoolingPoolType::max, 128, "in5a"); + Symbol in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, PoolingPoolType::max, 128, "in5b"); + Symbol pool6 = Pooling("pool6", in5b, Shape(7, 7), PoolingPoolType::avg, + false, false, PoolingPoolingConvention::valid, Shape(1, 1)); + + Symbol flatten = Flatten("flatten", pool6); + + Symbol fc1_w("fc1_w"), fc1_b("fc1_b"); + Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes); + + return SoftmaxOutput("softmax", fc1, data_label); +} + +int main(int argc, char const *argv[]) { + int batch_size = 50; + int max_epoch = 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + auto googlenet = GoogleNetSymbol(10); + std::map args_map; + std::map aux_map; + + args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu()); + args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu()); + googlenet.InferArgsMap(Context::gpu(), &args_map, args_map); + + auto train_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./train.lst") + .SetParam("path_imgrec", "./train.rec") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .SetParam("shuffle", 1) + .CreateDataIter(); + + auto val_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./val.lst") + .SetParam("path_imgrec", "./_val.rec") + .SetParam("data_shape", Shape(3, 256, 256)) + .SetParam("batch_size", batch_size) + .CreateDataIter(); + + Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10); + + for (int iter = 0; iter < max_epoch; ++iter) { + LG << "Epoch: " << iter; + train_iter.Reset(); + while (train_iter.Next()) { + auto data_batch = train_iter.GetDataBatch(); + args_map["data"] = data_batch.data.Copy(Context::gpu()); + args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + 
NDArray::WaitAll(); + auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); + exec->Forward(true); + exec->Backward(); + exec->UpdateAll(opt, learning_rate, weight_decay); + delete exec; + } + + Accuracy acu; + val_iter.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + args_map["data"] = data_batch.data.Copy(Context::gpu()); + args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + NDArray::WaitAll(); + auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); + exec->Forward(false); + NDArray::WaitAll(); + acu.Update(data_batch.label, exec->outputs[0]); + delete exec; + } + LG << "Accuracy: " << acu.Get(); + } + MXNotifyShutdown(); + return 0; +} diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp new file mode 100644 index 000000000000..b65611215b7a --- /dev/null +++ b/cpp-package/example/inception_bn.cpp @@ -0,0 +1,192 @@ +/*! + * Copyright (c) 2016 by Contributors + */ +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +// Allow IDE to parse the types +#include "../include/mxnet-cpp/op.h" + +using namespace mxnet::cpp; + +static const Symbol BN_BETA; +static const Symbol BN_GAMMA; + +Symbol ConvFactoryBN(Symbol data, int num_filter, + Shape kernel, Shape stride, Shape pad, + const std::string & name, + const std::string & suffix = "") { + Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b"); + + Symbol conv = Convolution("conv_" + name + suffix, data, + conv_w, conv_b, kernel, + num_filter, stride, Shape(1, 1), pad); + Symbol bn = BatchNorm("bn_" + name + suffix, conv, BN_GAMMA, BN_BETA); + return Activation("relu_" + name + suffix, bn, "relu"); +} + +Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red, + int num_3x3, int num_d3x3red, int num_d3x3, + PoolingPoolType pool, int proj, + const std::string & name) { + Symbol c1x1 = ConvFactoryBN(data, num_1x1, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "1x1"); + Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_3x3r"); + Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_3x3"); + Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_double_3x3", "_reduce"); + Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_double_3x3_0"); + cd3x3 = ConvFactoryBN(data = cd3x3, num_d3x3, Shape(3, 3), Shape(1, 1), + Shape(1, 1), name + "_double_3x3_1"); + Symbol pooling = Pooling(name + "_pool", data, + Shape(3, 3), pool, false, false, + PoolingPoolingConvention::valid, + Shape(1, 1), Shape(1, 1)); + Symbol cproj = ConvFactoryBN(pooling, proj, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_proj"); + std::vector lst; + lst.push_back(c1x1); + lst.push_back(c3x3); + lst.push_back(cd3x3); + lst.push_back(cproj); + return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); +} + +Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3, + int num_d3x3red, int num_d3x3, const std::string & name) { + Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), + Shape(1, 1), Shape(0, 0), + name + "_3x3", "_reduce"); + Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(2, 2), + Shape(1, 1), name + "_3x3"); + Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1), + Shape(0, 0), name + "_double_3x3", "_reduce"); + Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), 
Shape(1, 1), + Shape(1, 1), name + "_double_3x3_0"); + cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(2, 2), + Shape(1, 1), name + "_double_3x3_1"); + Symbol pooling = Pooling("max_pool_" + name + "_pool", data, + Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + std::vector lst; + lst.push_back(c3x3); + lst.push_back(cd3x3); + lst.push_back(pooling); + return Concat("ch_concat_" + name + "_chconcat", lst, lst.size()); +} + +Symbol InceptionSymbol(int num_classes) { + // data and label + Symbol data = Symbol::Variable("data"); + Symbol data_label = Symbol::Variable("data_label"); + + // stage 1 + Symbol conv1 = ConvFactoryBN(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1"); + Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + + // stage 2 + Symbol conv2red = ConvFactoryBN(pool1, 64, Shape(1, 1), Shape(1, 1), Shape(0, 0), "conv2red"); + Symbol conv2 = ConvFactoryBN(conv2red, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv2"); + Symbol pool2 = Pooling("pool2", conv2, Shape(3, 3), PoolingPoolType::max, + false, false, PoolingPoolingConvention::valid, Shape(2, 2)); + + // stage 3 + Symbol in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, PoolingPoolType::avg, 32, "3a"); + Symbol in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, PoolingPoolType::avg, 64, "3b"); + Symbol in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c"); + + // stage 4 + Symbol in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, PoolingPoolType::avg, 128, "4a"); + Symbol in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, PoolingPoolType::avg, 128, "4b"); + Symbol in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, PoolingPoolType::avg, 128, "4c"); + Symbol in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, PoolingPoolType::avg, 128, "4d"); + Symbol in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e"); + + // stage 5 + Symbol in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, PoolingPoolType::avg, 128, "5a"); + Symbol in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, PoolingPoolType::max, 128, "5b"); + + // average pooling + Symbol avg = Pooling("global_pool", in5b, Shape(7, 7), PoolingPoolType::avg); + + // classifier + Symbol flatten = Flatten("flatten", avg); + Symbol conv1_w("conv1_w"), conv1_b("conv1_b"); + Symbol fc1 = FullyConnected("fc1", flatten, conv1_w, conv1_b, num_classes); + return SoftmaxOutput("softmax", fc1, data_label); +} + +int main(int argc, char const *argv[]) { + int batch_size = 40; + int max_epoch = 100; + float learning_rate = 1e-4; + float weight_decay = 1e-4; + + auto inception_bn_net = InceptionSymbol(10); + std::map args_map; + std::map aux_map; + + args_map["data"] = NDArray(Shape(batch_size, 3, 224, 224), Context::gpu()); + args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu()); + inception_bn_net.InferArgsMap(Context::gpu(), &args_map, args_map); + + auto train_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./train.lst") + .SetParam("path_imgrec", "./train.rec") + .SetParam("data_shape", Shape(3, 224, 224)) + .SetParam("batch_size", batch_size) + .SetParam("shuffle", 1) + .CreateDataIter(); + + auto val_iter = MXDataIter("ImageRecordIter") + .SetParam("path_imglist", "./val.lst") + .SetParam("path_imgrec", "./val.rec") + .SetParam("data_shape", Shape(3, 224, 224)) + .SetParam("batch_size", batch_size) + .CreateDataIter(); + + Optimizer* opt = 
OptimizerRegistry::Find("ccsgd"); + opt->SetParam("momentum", 0.9) + ->SetParam("rescale_grad", 1.0 / batch_size) + ->SetParam("clip_gradient", 10); + + auto *exec = inception_bn_net.SimpleBind(Context::gpu(), args_map); + + for (int iter = 0; iter < max_epoch; ++iter) { + LG << "Epoch: " << iter; + train_iter.Reset(); + while (train_iter.Next()) { + auto data_batch = train_iter.GetDataBatch(); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + + exec->Forward(true); + exec->Backward(); + exec->UpdateAll(opt, learning_rate, weight_decay); + NDArray::WaitAll(); + } + + Accuracy acu; + val_iter.Reset(); + while (val_iter.Next()) { + auto data_batch = val_iter.GetDataBatch(); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); + NDArray::WaitAll(); + exec->Forward(false); + NDArray::WaitAll(); + acu.Update(data_batch.label, exec->outputs[0]); + } + LG << "Accuracy: " << acu.Get(); + } + delete exec; + MXNotifyShutdown(); + return 0; +} diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp new file mode 100644 index 000000000000..d0101a038333 --- /dev/null +++ b/cpp-package/example/lenet.cpp @@ -0,0 +1,236 @@ +/*! + * Copyright (c) 2015 by Contributors + */ +#include +#include +#include +#include +#include +#include "mxnet-cpp/MxNetCpp.h" +// Allow IDE to parse the types +#include "../include/mxnet-cpp/op.h" + +using namespace std; +using namespace mxnet::cpp; + +class Lenet { + public: + Lenet() + : ctx_cpu(Context(DeviceType::kCPU, 0)), + ctx_dev(Context(DeviceType::kGPU, 0)) {} + void Run() { + /* + * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. + * "Gradient-based learning applied to document recognition." 
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
new file mode 100644
index 000000000000..d0101a038333
--- /dev/null
+++ b/cpp-package/example/lenet.cpp
@@ -0,0 +1,236 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+
+class Lenet {
+ public:
+  Lenet()
+      : ctx_cpu(Context(DeviceType::kCPU, 0)),
+        ctx_dev(Context(DeviceType::kGPU, 0)) {}
+  void Run() {
+    /*
+     * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
+     * "Gradient-based learning applied to document recognition."
+     * Proceedings of the IEEE (1998)
+     */
+
+    /*define the symbolic net*/
+    Symbol data = Symbol::Variable("data");
+    Symbol data_label = Symbol::Variable("data_label");
+    Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
+    Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
+    Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
+    Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
+    Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
+
+    Symbol conv1 =
+        Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
+    Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
+    Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
+                           false, false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+    Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b,
+                               Shape(5, 5), 50);
+    Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
+    Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
+                           false, false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+    Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b,
+                               Shape(2, 2), 500);
+    Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::tanh);
+    Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::max,
+                           false, false, PoolingPoolingConvention::valid, Shape(1, 1));
+
+    Symbol flatten = Flatten("flatten", pool3);
+    Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
+    Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::tanh);
+    Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10);
+
+    Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
+
+    for (auto s : lenet.ListArguments()) {
+      LG << s;
+    }
+
+    /*setup basic configs*/
+    int val_fold = 1;
+    int W = 28;
+    int H = 28;
+    int batch_size = 42;
+    int max_epoch = 100000;
+    float learning_rate = 1e-4;
+    float weight_decay = 1e-4;
+
+    /*prepare the data*/
+    vector<float> data_vec, label_vec;
+    size_t data_count = GetData(&data_vec, &label_vec);
+    const float *dptr = data_vec.data();
+    const float *lptr = label_vec.data();
+    NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
+                                 false);  // store in main memory, and copy to
+                                          // device memory while training
+    NDArray label_array =
+        NDArray(Shape(data_count), ctx_cpu,
+                false);  // it's also OK to just store them all in device memory
+    data_array.SyncCopyFromCPU(dptr, data_count * W * H);
+    label_array.SyncCopyFromCPU(lptr, data_count);
+    data_array.WaitToRead();
+    label_array.WaitToRead();
+
+    size_t train_num = data_count * (1 - val_fold / 10.0);
+    train_data = data_array.Slice(0, train_num);
+    train_label = label_array.Slice(0, train_num);
+    val_data = data_array.Slice(train_num, data_count);
+    val_label = label_array.Slice(train_num, data_count);
+
+    LG << "here read fin";
+
+    /*init some of the args*/
+    // map<string, NDArray> args_map;
+    args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
+    args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
+    NDArray::WaitAll();
+
+    LG << "here slice fin";
+    /*
+     * we can also feed in some of the args other than the input all by
+     * ourselves,
+     * fc2-w , fc1-b for example:
+     * */
+    // args_map["fc2_w"] =
+    //     NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
+    // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
+    // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
+    // args_map["fc1_b"] = 0;
+
+    lenet.InferArgsMap(ctx_dev, &args_map, args_map);
+    Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+    opt->SetParam("momentum", 0.9)
+       ->SetParam("rescale_grad", 1.0)
+       ->SetParam("clip_gradient", 10);
+
+    for (int ITER = 0; ITER < max_epoch; ++ITER) {
+      size_t start_index = 0;
+      while (start_index < train_num) {
+        if (start_index + batch_size > train_num) {
+          start_index = train_num - batch_size;
+        }
+        args_map["data"] =
+            train_data.Slice(start_index, start_index + batch_size)
+                .Copy(ctx_dev);
+        args_map["data_label"] =
+            train_label.Slice(start_index, start_index + batch_size)
+                .Copy(ctx_dev);
+        start_index += batch_size;
+        NDArray::WaitAll();
+
+        Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
+        exe->Forward(true);
+        exe->Backward();
+        exe->UpdateAll(opt, learning_rate, weight_decay);
+
+        delete exe;
+      }
+
+      LG << "Iter " << ITER
+         << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
+    }
+  }
+
+ private:
+  Context ctx_cpu;
+  Context ctx_dev;
+  map<string, NDArray> args_map;
+  NDArray train_data;
+  NDArray train_label;
+  NDArray val_data;
+  NDArray val_label;
+
+  size_t GetData(vector<float> *data, vector<float> *label) {
+    const char *train_data_path = "./train.csv";
+    ifstream inf(train_data_path);
+    string line;
+    inf >> line;  // ignore the header
+    size_t _N = 0;
+    while (inf >> line) {
+      for (auto &c : line) c = (c == ',') ? ' ' : c;
+      stringstream ss;
+      ss << line;
+      float _data;
+      ss >> _data;
+      label->push_back(_data);
+      while (ss >> _data) data->push_back(_data / 256.0);
+      _N++;
+    }
+    inf.close();
+    return _N;
+  }
+
+  float ValAccuracy(int batch_size, Symbol lenet) {
+    size_t val_num = val_data.GetShape()[0];
+
+    size_t correct_count = 0;
+    size_t all_count = 0;
+
+    size_t start_index = 0;
+    while (start_index < val_num) {
+      if (start_index + batch_size > val_num) {
+        start_index = val_num - batch_size;
+      }
+      args_map["data"] =
+          val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
+      args_map["data_label"] =
+          val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
+      start_index += batch_size;
+      NDArray::WaitAll();
+
+      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
+      exe->Forward(false);
+
+      const auto &out = exe->outputs;
+      NDArray out_cpu = out[0].Copy(ctx_cpu);
+      NDArray label_cpu =
+          val_label.Slice(start_index - batch_size, start_index).Copy(ctx_cpu);
+
+      NDArray::WaitAll();
+
+      const mx_float *dptr_out = out_cpu.GetData();
+      const mx_float *dptr_label = label_cpu.GetData();
+      for (int i = 0; i < batch_size; ++i) {
+        float label = dptr_label[i];
+        int cat_num = out_cpu.GetShape()[1];
+        float p_label = 0, max_p = dptr_out[i * cat_num];
+        for (int j = 0; j < cat_num; ++j) {
+          float p = dptr_out[i * cat_num + j];
+          if (max_p < p) {
+            p_label = j;
+            max_p = p;
+          }
+        }
+        if (label == p_label) correct_count++;
+      }
+      all_count += batch_size;
+
+      delete exe;
+    }
+    return correct_count * 1.0 / all_count;
+  }
+};
+
+int main(int argc, char const *argv[]) {
+  Lenet lenet;
+  lenet.Run();
+  MXNotifyShutdown();
+  return 0;
+}
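GetData above expects a Kaggle-style MNIST CSV: one header line (skipped), then one row per image whose first field is the label and whose remaining 784 fields are pixel values in 0-255, scaled by 1/256 on load. Illustratively (values elided for width):

label,pixel0,pixel1,...,pixel783
7,0,0,...,0
2,0,0,...,0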
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
new file mode 100644
index 000000000000..889b9f083fe7
--- /dev/null
+++ b/cpp-package/example/lenet_with_mxdataiter.cpp
@@ -0,0 +1,122 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+
+Symbol LenetSymbol() {
+  /*
+   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
+   * "Gradient-based learning applied to document recognition."
+   * Proceedings of the IEEE (1998)
+   */
+
+  /*define the symbolic net*/
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+  Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
+  Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
+  Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
+  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
+  Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
+
+  Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
+  Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
+  Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
+                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50);
+  Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
+  Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
+                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));
+
+  Symbol flatten = Flatten("flatten", pool2);
+  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
+  Symbol tanh3 = Activation("tanh3", fc1, ActivationActType::tanh);
+  Symbol fc2 = FullyConnected("fc2", tanh3, fc2_w, fc2_b, 10);
+
+  Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
+
+  return lenet;
+}
+
+int main(int argc, char const *argv[]) {
+  /*setup basic configs*/
+  int W = 28;
+  int H = 28;
+  int batch_size = 128;
+  int max_epoch = 100;
+  float learning_rate = 1e-4;
+  float weight_decay = 1e-4;
+
+  auto lenet = LenetSymbol();
+  std::map<std::string, NDArray> args_map;
+
+  args_map["data"] = NDArray(Shape(batch_size, 1, W, H), Context::gpu());
+  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
+  lenet.InferArgsMap(Context::gpu(), &args_map, args_map);
+
+  args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), Context::gpu());
+  NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]);
+  args_map["fc2_b"] = NDArray(Shape(10), Context::gpu());
+  args_map["fc2_b"] = 0;
+
+  auto train_iter = MXDataIter("MNISTIter")
+      .SetParam("image", "./train-images-idx3-ubyte")
+      .SetParam("label", "./train-labels-idx1-ubyte")
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .SetParam("flat", 0)
+      .CreateDataIter();
+  auto val_iter = MXDataIter("MNISTIter")
+      .SetParam("image", "./t10k-images-idx3-ubyte")
+      .SetParam("label", "./t10k-labels-idx1-ubyte")
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+     ->SetParam("rescale_grad", 1.0)
+     ->SetParam("clip_gradient", 10);
+
+  for (int iter = 0; iter < max_epoch; ++iter) {
+    LG << "Epoch: " << iter;
+    train_iter.Reset();
+    while (train_iter.Next()) {
+      auto data_batch = train_iter.GetDataBatch();
+      args_map["data"] = data_batch.data.Copy(Context::gpu());
+      args_map["data_label"] = data_batch.label.Copy(Context::gpu());
+      NDArray::WaitAll();
+      auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      delete exec;
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      args_map["data"] = data_batch.data.Copy(Context::gpu());
+      args_map["data_label"] = data_batch.label.Copy(Context::gpu());
+      NDArray::WaitAll();
+      auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+      delete exec;
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  MXNotifyShutdown();
+  return 0;
+}
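Two executor idioms appear in these examples: this file rebinds a fresh executor for every batch (SimpleBind plus delete inside the loop), while inception_bn.cpp binds once and copies each batch into the already-bound arrays. A sketch of the cheaper bind-once pattern, reusing the names from the loop above:

// Bind once, then reuse the executor; copy each batch into the bound arrays.
auto *exec = lenet.SimpleBind(Context::gpu(), args_map);
while (train_iter.Next()) {
  auto data_batch = train_iter.GetDataBatch();
  data_batch.data.CopyTo(&args_map["data"]);        // in-place copy, no realloc
  data_batch.label.CopyTo(&args_map["data_label"]);
  NDArray::WaitAll();
  exec->Forward(true);
  exec->Backward();
  exec->UpdateAll(opt, learning_rate, weight_decay);
}
delete exec;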
diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
new file mode 100644
index 000000000000..347a196835d2
--- /dev/null
+++ b/cpp-package/example/mlp.cpp
@@ -0,0 +1,164 @@
+/*!
+ * Copyright (c) 2015 by Contributors
+ */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace std;
+using namespace mxnet::cpp;
+
+/*
+ * In this example,
+ * we hand-craft data in 10 classes with a simple pattern
+ * and train an MLP to recognize the pattern.
+ */
+
+void OutputAccuracy(mx_float* pred, mx_float* target) {
+  int right = 0;
+  for (int i = 0; i < 128; ++i) {
+    float mx_p = pred[i * 10 + 0];
+    float p_y = 0;
+    for (int j = 0; j < 10; ++j) {
+      if (pred[i * 10 + j] > mx_p) {
+        mx_p = pred[i * 10 + j];
+        p_y = j;
+      }
+    }
+    if (p_y == target[i]) right++;
+  }
+  cout << "Accuracy: " << right / 128.0 << endl;
+}
+
+void MLP() {
+  auto sym_x = Symbol::Variable("X");
+  auto sym_label = Symbol::Variable("label");
+
+  const int nLayers = 2;
+  vector<int> layerSizes({512, 10});
+  vector<Symbol> weights(nLayers);
+  vector<Symbol> biases(nLayers);
+  vector<Symbol> outputs(nLayers);
+
+  for (int i = 0; i < nLayers; i++) {
+    string istr = to_string(i);
+    weights[i] = Symbol::Variable(string("w") + istr);
+    biases[i] = Symbol::Variable(string("b") + istr);
+    Symbol fc = FullyConnected(string("fc") + istr,
+                               i == 0 ? sym_x : outputs[i - 1],
+                               weights[i], biases[i], layerSizes[i]);
+    outputs[i] = LeakyReLU(string("act") + istr, fc, LeakyReLUActType::leaky);
+  }
+  auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);
+
+  Context ctx_dev(DeviceType::kCPU, 0);
+
+  NDArray array_x(Shape(128, 28), ctx_dev, false);
+  NDArray array_y(Shape(128), ctx_dev, false);
+
+  mx_float* aptr_x = new mx_float[128 * 28];
+  mx_float* aptr_y = new mx_float[128];
+
+  // we make the data by hand, in 10 classes, with some pattern
+  for (int i = 0; i < 128; i++) {
+    for (int j = 0; j < 28; j++) {
+      aptr_x[i * 28 + j] = i % 10 * 1.0f;
+    }
+    aptr_y[i] = i % 10;
+  }
+  array_x.SyncCopyFromCPU(aptr_x, 128 * 28);
+  array_x.WaitToRead();
+  array_y.SyncCopyFromCPU(aptr_y, 128);
+  array_y.WaitToRead();
+
+  // init the parameters
+  NDArray array_w_1(Shape(512, 28), ctx_dev, false);
+  NDArray array_b_1(Shape(512), ctx_dev, false);
+  NDArray array_w_2(Shape(10, 512), ctx_dev, false);
+  NDArray array_b_2(Shape(10), ctx_dev, false);
+
+  // the parameters should be initialized in some kind of distribution,
+  // so it learns fast
+  // but here just give a const value by hand
+  array_w_1 = 0.5f;
+  array_b_1 = 0.0f;
+  array_w_2 = 0.5f;
+  array_b_2 = 0.0f;
+
+  // the grads
+  NDArray array_w_1_g(Shape(512, 28), ctx_dev, false);
+  NDArray array_b_1_g(Shape(512), ctx_dev, false);
+  NDArray array_w_2_g(Shape(10, 512), ctx_dev, false);
+  NDArray array_b_2_g(Shape(10), ctx_dev, false);
+
+  // Bind the symbolic network with the NDArrays
+  // all the input args
+  std::vector<NDArray> in_args;
+  in_args.push_back(array_x);
+  in_args.push_back(array_w_1);
+  in_args.push_back(array_b_1);
+  in_args.push_back(array_w_2);
+  in_args.push_back(array_b_2);
+  in_args.push_back(array_y);
+  // all the grads
+  std::vector<NDArray> arg_grad_store;
+  arg_grad_store.push_back(NDArray());  // we don't need the grad of the input
+  arg_grad_store.push_back(array_w_1_g);
+  arg_grad_store.push_back(array_b_1_g);
+  arg_grad_store.push_back(array_w_2_g);
+  arg_grad_store.push_back(array_b_2_g);
+  arg_grad_store.push_back(
+      NDArray());  // neither do we need the grad of the loss
+  // how to handle the grad
+  std::vector<OpReqType> grad_req_type;
+  grad_req_type.push_back(kNullOp);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kWriteTo);
+  grad_req_type.push_back(kNullOp);
+  std::vector<NDArray> aux_states;
+
+  cout << "make the Executor" << endl;
+  Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
+                               grad_req_type, aux_states);
+
+  cout << "Training" << endl;
+  int max_iters = 20000;
+  mx_float learning_rate = 0.0001;
+  for (int iter = 0; iter < max_iters; ++iter) {
+    exe->Forward(true);
+
+    if (iter % 100 == 0) {
+      cout << "epoch " << iter << endl;
+      std::vector<NDArray>& out = exe->outputs;
+      float* cptr = new float[128 * 10];
+      out[0].SyncCopyToCPU(cptr, 128 * 10);
+      NDArray::WaitAll();
+      OutputAccuracy(cptr, aptr_y);
+      delete[] cptr;
+    }
+
+    // update the parameters
+    exe->Backward();
+    for (int i = 1; i < 5; ++i) {
+      in_args[i] -= arg_grad_store[i] * learning_rate;
+    }
+    NDArray::WaitAll();
+  }
+
+  delete exe;
+  delete[] aptr_x;
+  delete[] aptr_y;
+}
+
+int main(int argc, char** argv) {
+  MLP();
+  MXNotifyShutdown();
+  return 0;
+}
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
new file mode 100644
index 000000000000..5d3131223ef3
--- /dev/null
+++ b/cpp-package/example/resnet.cpp
@@ -0,0 +1,196 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ */
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/MxNetCpp.h"
+// Allow IDE to parse the types
+#include "../include/mxnet-cpp/op.h"
+
+using namespace mxnet::cpp;
+
+Symbol ConvolutionNoBias(const std::string& symbol_name,
+                         Symbol data,
+                         Symbol weight,
+                         Shape kernel,
+                         int num_filter,
+                         Shape stride = Shape(1, 1),
+                         Shape dilate = Shape(1, 1),
+                         Shape pad = Shape(0, 0),
+                         int num_group = 1,
+                         int64_t workspace = 512) {
+  return Operator("Convolution")
+      .SetParam("kernel", kernel)
+      .SetParam("num_filter", num_filter)
+      .SetParam("stride", stride)
+      .SetParam("dilate", dilate)
+      .SetParam("pad", pad)
+      .SetParam("num_group", num_group)
+      .SetParam("workspace", workspace)
+      .SetParam("no_bias", true)
+      .SetInput("data", data)
+      .SetInput("weight", weight)
+      .CreateSymbol(symbol_name);
+}
+
+static const Symbol BN_BETA;
+static const Symbol BN_GAMMA;
+
+Symbol getConv(const std::string & name, Symbol data,
+               int num_filter,
+               Shape kernel, Shape stride, Shape pad,
+               bool with_relu,
+               mx_float bn_momentum) {
+  Symbol conv_w(name + "_w");
+  Symbol conv = ConvolutionNoBias(name, data, conv_w,
+                                  kernel, num_filter, stride, Shape(1, 1),
+                                  pad, 1, 512);
+
+  Symbol bn = BatchNorm(name + "_bn", conv, BN_GAMMA, BN_BETA, 2e-5, bn_momentum, false);
+
+  if (with_relu) {
+    return Activation(name + "_relu", bn, "relu");
+  } else {
+    return bn;
+  }
+}
+
+Symbol makeBlock(const std::string & name, Symbol data, int num_filter,
+                 bool dim_match, mx_float bn_momentum) {
+  Shape stride;
+  if (dim_match) {
+    stride = Shape(1, 1);
+  } else {
+    stride = Shape(2, 2);
+  }
+
+  Symbol conv1 = getConv(name + "_conv1", data, num_filter,
+                         Shape(3, 3), stride, Shape(1, 1),
+                         true, bn_momentum);
+
+  Symbol conv2 = getConv(name + "_conv2", conv1, num_filter,
+                         Shape(3, 3), Shape(1, 1), Shape(1, 1),
+                         false, bn_momentum);
+
+  Symbol shortcut;
+
+  if (dim_match) {
+    shortcut = data;
+  } else {
+    Symbol shortcut_w(name + "_proj_w");
+    shortcut = ConvolutionNoBias(name + "_proj", data, shortcut_w,
+                                 Shape(2, 2), num_filter,
+                                 Shape(2, 2), Shape(1, 1), Shape(0, 0),
+                                 1, 512);
+  }
+
+  Symbol fused = shortcut + conv2;
+  return Activation(name + "_relu", fused, "relu");
+}
+
+Symbol getBody(Symbol data, int num_level, int num_block, int num_filter, mx_float bn_momentum) {
+  for (int level = 0; level < num_level; level++) {
+    for (int block = 0; block < num_block; block++) {
+      data = makeBlock("level" + std::to_string(level + 1) + "_block" + std::to_string(block + 1),
+                       data, num_filter * (std::pow(2, level)),
+                       (level == 0 || block > 0), bn_momentum);
+    }
+  }
+  return data;
+}
+
+Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9,
+                    int num_filter = 16, mx_float bn_momentum = 0.9,
+                    mxnet::cpp::Shape pool_kernel = mxnet::cpp::Shape(8, 8)) {
+  // data and label
+  Symbol data = Symbol::Variable("data");
+  Symbol data_label = Symbol::Variable("data_label");
+
+  Symbol zscore = BatchNorm("zscore", data, BN_GAMMA, BN_BETA, 0.001, bn_momentum);
+
+  Symbol conv = getConv("conv0", zscore, num_filter,
+                        Shape(3, 3), Shape(1, 1), Shape(1, 1),
+                        true, bn_momentum);
+
+  Symbol body = getBody(conv, num_level, num_block, num_filter, bn_momentum);
+
+  Symbol pool = Pooling("pool", body, pool_kernel, PoolingPoolType::avg);
+
+  Symbol flat = Flatten("flatten", pool);
+
+  Symbol fc_w("fc_w"), fc_b("fc_b");
+  Symbol fc = FullyConnected("fc", flat, fc_w, fc_b, num_class);
+
+  return SoftmaxOutput("softmax", fc, data_label);
+}
+
+int main(int argc, char const *argv[]) {
+  int batch_size = 50;
+  int max_epoch = 100;
+  float learning_rate = 1e-4;
+  float weight_decay = 1e-4;
+
+  auto resnet = ResNetSymbol(10);
+  std::map<std::string, NDArray> args_map;
+  std::map<std::string, NDArray> aux_map;
+
+  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), Context::gpu());
+  args_map["data_label"] = NDArray(Shape(batch_size), Context::gpu());
+  resnet.InferArgsMap(Context::gpu(), &args_map, args_map);
+
+  auto train_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./sf1_train.lst")
+      .SetParam("path_imgrec", "./sf1_train.rec")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .SetParam("shuffle", 1)
+      .CreateDataIter();
+
+  auto val_iter = MXDataIter("ImageRecordIter")
+      .SetParam("path_imglist", "./sf1_val.lst")
+      .SetParam("path_imgrec", "./sf1_val.rec")
+      .SetParam("data_shape", Shape(3, 256, 256))
+      .SetParam("batch_size", batch_size)
+      .CreateDataIter();
+
+  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
+  opt->SetParam("momentum", 0.9)
+     ->SetParam("rescale_grad", 1.0 / batch_size)
+     ->SetParam("clip_gradient", 10);
+
+  auto *exec = resnet.SimpleBind(Context::gpu(), args_map);
+
+  for (int iter = 0; iter < max_epoch; ++iter) {
+    LG << "Epoch: " << iter;
+    train_iter.Reset();
+    while (train_iter.Next()) {
+      auto data_batch = train_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+
+      exec->Forward(true);
+      exec->Backward();
+      exec->UpdateAll(opt, learning_rate, weight_decay);
+      NDArray::WaitAll();
+    }
+
+    Accuracy acu;
+    val_iter.Reset();
+    while (val_iter.Next()) {
+      auto data_batch = val_iter.GetDataBatch();
+      data_batch.data.CopyTo(&args_map["data"]);
+      data_batch.label.CopyTo(&args_map["data_label"]);
+      NDArray::WaitAll();
+      exec->Forward(false);
+      NDArray::WaitAll();
+      acu.Update(data_batch.label, exec->outputs[0]);
+    }
+    LG << "Accuracy: " << acu.Get();
+  }
+  delete exec;
+  MXNotifyShutdown();
+  return 0;
+}
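ConvolutionNoBias above is the escape hatch used when a generated wrapper does not expose a parameter: any operator can be assembled through the generic Operator builder. A sketch of the same pattern for a bias-free FullyConnected layer (the helper name is illustrative, not part of the patch):

// Hypothetical helper in the style of ConvolutionNoBias: builds a
// FullyConnected symbol with no_bias=true via the generic Operator API.
Symbol FullyConnectedNoBias(const std::string& symbol_name,
                            Symbol data, Symbol weight, int num_hidden) {
  return Operator("FullyConnected")
      .SetParam("num_hidden", num_hidden)
      .SetParam("no_bias", true)
      .SetInput("data", data)
      .SetInput("weight", weight)
      .CreateSymbol(symbol_name);
}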
diff --git a/cpp-package/example/run_lenet_with_mxdataiter.sh b/cpp-package/example/run_lenet_with_mxdataiter.sh
new file mode 100755
index 000000000000..fffc355865bc
--- /dev/null
+++ b/cpp-package/example/run_lenet_with_mxdataiter.sh
@@ -0,0 +1,6 @@
+if [ ! -f "./mnist.zip" ]; then
+  wget http://webdocs.cs.ualberta.ca/~bx3/data/mnist.zip
+  unzip -u mnist.zip
+fi
+make lenet_with_mxdataiter
+LD_LIBRARY_PATH=../lib/linux ./lenet_with_mxdataiter
diff --git a/cpp-package/include/mxnet-cpp/.gitignore b/cpp-package/include/mxnet-cpp/.gitignore
new file mode 100644
index 000000000000..995efdd6f07b
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/.gitignore
@@ -0,0 +1,2 @@
+# Rebuildable file(s)
+op.h
diff --git a/cpp-package/include/mxnet-cpp/CPPLINT.cfg b/cpp-package/include/mxnet-cpp/CPPLINT.cfg
new file mode 100644
index 000000000000..2f2b772b465b
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/CPPLINT.cfg
@@ -0,0 +1,2 @@
+filter=-runtime/references
+exclude_files=op.h
diff --git a/cpp-package/include/mxnet-cpp/MxNetCpp.h b/cpp-package/include/mxnet-cpp/MxNetCpp.h
new file mode 100644
index 000000000000..dc5d7750a70d
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/MxNetCpp.h
@@ -0,0 +1,23 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file MxNetCpp.h
+ * \brief meta include file for mxnet.cpp
+ * \author Chuntao Hong, Zhang Chen
+ */
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_MXNETCPP_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_MXNETCPP_H_
+
+#include "mxnet-cpp/executor.hpp"
+#include "mxnet-cpp/symbol.hpp"
+#include "mxnet-cpp/ndarray.hpp"
+#include "mxnet-cpp/operator.hpp"
+#include "mxnet-cpp/optimizer.hpp"
+#include "mxnet-cpp/kvstore.hpp"
+#include "mxnet-cpp/op.h"
+#include "mxnet-cpp/op_suppl.h"
+#include "mxnet-cpp/io.hpp"
+#include "mxnet-cpp/metric.h"
+#include "mxnet-cpp/initializer.h"
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_MXNETCPP_H_
diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h
new file mode 100644
index 000000000000..18f268a8a85a
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/base.h
@@ -0,0 +1,38 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file base.h
+* \brief base definitions for mxnetcpp
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_BASE_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_BASE_H_
+
+#include <string>
+#include "mxnet/c_api.h"
+#include "nnvm/c_api.h"
+
+namespace mxnet {
+namespace cpp {
+
+typedef unsigned index_t;
+
+enum OpReqType {
+  /*! \brief no operation, do not write anything */
+  kNullOp,
+  /*! \brief write gradient to provided space */
+  kWriteTo,
+  /*!
+   * \brief perform an inplace write,
+   * Target shares memory with one of input arguments.
+   * This option only happens when the target and an input share memory.
+   */
+  kWriteInplace,
+  /*! \brief add to the provided space */
+  kAddTo
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_BASE_H_
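base.h's OpReqType is what the grad_req vector passed to Executor is made of (see mlp.cpp above): one entry per argument, in the symbol's ListArguments() order. A sketch for a net whose arguments are data, weight, bias, label:

// One OpReqType entry per argument, in ListArguments() order.
// kNullOp skips gradient computation; kWriteTo overwrites the matching
// entry of the gradient store on every backward pass.
std::vector<OpReqType> grad_req_type;
grad_req_type.push_back(kNullOp);   // data: input, no gradient needed
grad_req_type.push_back(kWriteTo);  // weight
grad_req_type.push_back(kWriteTo);  // bias
grad_req_type.push_back(kNullOp);   // label: no gradient needed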
diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h
new file mode 100644
index 000000000000..9b358f15fde7
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/executor.h
@@ -0,0 +1,137 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file executor.h
+* \brief executor definition
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_H_
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/symbol.h"
+
+namespace mxnet {
+namespace cpp {
+
+class Optimizer;
+
+/*!
+* \brief Executor interface
+*/
+class Executor {
+ public:
+  Executor(const Symbol &symbol, Context context,
+           const std::vector<NDArray> &arg_arrays,
+           const std::vector<NDArray> &grad_arrays,
+           const std::vector<OpReqType> &grad_reqs,
+           const std::vector<NDArray> &aux_arrays,
+           const std::map<std::string, Context> &group_to_ctx =
+               std::map<std::string, Context>(),
+           Executor *shared_exec = nullptr);
+  explicit Executor(const ExecutorHandle &h) { handle_ = h; }
+  /*!
+   * \brief Perform a Forward operation of Operator
+   * After this operation, user can get the result by using function head.
+   */
+  void Forward(bool is_train) {
+    MXExecutorForward(handle_, is_train ? 1 : 0);
+    mx_uint out_size;
+    NDArrayHandle *out_array;
+    CHECK_EQ(MXExecutorOutputs(handle_, &out_size, &out_array), 0);
+    for (mx_uint i = 0; i < out_size; ++i) {
+      outputs[i] = NDArray(out_array[i]);
+    }
+  }
+  /*!
+   * \brief Perform a Backward operation of the Operator.
+   * This must be called after Forward.
+   * After this operation, NDArrays specified by grad_in_args_store will be
+   * updated accordingly.
+   * User is allowed to pass in an empty array if the head node is a
+   * loss function and the head gradient is not needed.
+   *
+   * \param head_grads the gradients of the head nodes to be backpropagated.
+   */
+  void Backward(const std::vector<NDArray> &head_grads =
+                    std::vector<NDArray>()) {
+    std::vector<NDArrayHandle> head_grads_;
+    for (auto d : head_grads) {
+      head_grads_.push_back(d.GetHandle());
+    }
+    if (head_grads_.size() > 0) {
+      MXExecutorBackward(handle_, head_grads_.size(), head_grads_.data());
+    } else {
+      MXExecutorBackward(handle_, 0, nullptr);
+    }
+  }
+  // TODO(zhangchen-qinyinghua)
+  // To implement reshape function
+  void Reshape();
+  /*!
+   * \brief get a debug string describing the executor's internal state
+   * \return the debug string
+   */
+  std::string DebugStr();
+  /*!
+   * \brief update the arguments with given learning rate and optimizer
+   * \param opt the pointer to the optimizer
+   * \param lr learning rate
+   * \param wd weight decay
+   * \param arg_update_begin begin index of the arguments to be updated, it
+   * starts after the input data by default
+   * \param arg_update_end end index of the arguments to be updated, it ends
+   * before the label data by default
+   */
+  void UpdateAll(Optimizer *opt, float lr, float wd, int arg_update_begin = 1,
+                 int arg_update_end = -1);
+  /*!
+   * \brief destructor, free the handle
+   */
+  ~Executor() { MXExecutorFree(handle_); }
+  std::vector<NDArray> arg_arrays;
+  std::vector<NDArray> grad_arrays;
+  std::vector<NDArray> aux_arrays;
+  /*!
+   * \brief arrays store the outputs of forward
+   */
+  std::vector<NDArray> outputs;
+  std::map<std::string, NDArray> arg_dict() {
+    return GetDict(symbol_.ListArguments(), arg_arrays);
+  }
+  std::map<std::string, NDArray> grad_dict() {
+    return GetDict(symbol_.ListArguments(), grad_arrays);
+  }
+  std::map<std::string, NDArray> aux_dict() {
+    return GetDict(symbol_.ListAuxiliaryStates(), aux_arrays);
+  }
+
+ private:
+  Executor(const Executor &e);
+  Executor &operator=(const Executor &e);
+  ExecutorHandle handle_;
+  Symbol symbol_;
+  std::map<std::string, NDArray> GetDict(const std::vector<std::string> &names,
+                                         const std::vector<NDArray> &arrays) {
+    std::map<std::string, NDArray> ret;
+    std::set<std::string> name_set;
+    for (const auto &s : names) {
+      CHECK(name_set.find(s) == name_set.end()) << "Duplicate names detected, "
+                                                << s;
+      name_set.insert(s);
+    }
+    CHECK_EQ(name_set.size(), arrays.size())
+        << "names size not equal to arrays size";
+    for (size_t i = 0; i < names.size(); ++i) {
+      ret[names[i]] = arrays[i];
+    }
+    return ret;
+  }
+};
+}  // namespace cpp
+}  // namespace mxnet
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_H_
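All the examples call Backward() with no arguments because their graphs end in SoftmaxOutput, which supplies its own head gradient. When the bound symbol is not a loss, the caller must seed backpropagation explicitly. A minimal sketch using the NDArray API from this patch (an all-ones head gradient is illustrative):

// Seed backprop with an explicit head gradient of the output's shape.
NDArray head_grad(exec->outputs[0].GetShape(), Context::gpu(), false);
head_grad = 1.0f;  // d(out)/d(out) = 1 for a plain sum-style objective
exec->Forward(true);
exec->Backward(std::vector<NDArray>{head_grad});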
diff --git a/cpp-package/include/mxnet-cpp/executor.hpp b/cpp-package/include/mxnet-cpp/executor.hpp
new file mode 100644
index 000000000000..4cae684f8881
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/executor.hpp
@@ -0,0 +1,92 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file executor.hpp
+ * \brief implementation of the executor
+ * \author Zhang Chen, Chuntao Hong
+ */
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_HPP_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_HPP_
+
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/executor.h"
+#include "mxnet-cpp/optimizer.h"
+
+namespace mxnet {
+namespace cpp {
+inline Executor::Executor(const Symbol &symbol, Context context,
+                          const std::vector<NDArray> &arg_arrays,
+                          const std::vector<NDArray> &grad_arrays,
+                          const std::vector<OpReqType> &grad_reqs,
+                          const std::vector<NDArray> &aux_arrays,
+                          const std::map<std::string, Context> &group_to_ctx,
+                          Executor *shared_exec) {
+  this->arg_arrays = arg_arrays;
+  this->grad_arrays = grad_arrays;
+  this->aux_arrays = aux_arrays;
+  this->symbol_ = symbol;
+
+  std::vector<NDArrayHandle> arg_handles;
+  std::vector<NDArrayHandle> grad_handles;
+  std::vector<NDArrayHandle> aux_handles;
+
+  for (const auto &array : arg_arrays) {
+    arg_handles.push_back(array.GetHandle());
+  }
+  for (const auto &array : grad_arrays) {
+    grad_handles.push_back(array.GetHandle());
+  }
+  for (const auto &array : aux_arrays) {
+    aux_handles.push_back(array.GetHandle());
+  }
+
+  std::vector<mx_uint> grad_reqs_uint;
+  for (auto s : grad_reqs) grad_reqs_uint.push_back(s);
+
+  std::vector<const char*> map_keys;
+  std::vector<int> dev_types, dev_ids;
+  for (const auto &s : group_to_ctx) {
+    map_keys.push_back(s.first.c_str());
+    dev_types.push_back(s.second.GetDeviceType());
+    dev_ids.push_back(s.second.GetDeviceId());
+  }
+
+  ExecutorHandle *shared_exec_handle =
+      shared_exec == nullptr ? nullptr : &shared_exec->handle_;
+
+  CHECK_EQ(MXExecutorBindEX(symbol.GetHandle(), context.GetDeviceType(),
+                            context.GetDeviceId(), group_to_ctx.size(),
+                            map_keys.data(), dev_types.data(), dev_ids.data(),
+                            arg_handles.size(), arg_handles.data(),
+                            grad_handles.data(), grad_reqs_uint.data(),
+                            aux_handles.size(), aux_handles.data(),
+                            shared_exec_handle, &handle_),
+           0);
+
+  mx_uint out_size;
+  NDArrayHandle *out_array;
+  CHECK_EQ(MXExecutorOutputs(handle_, &out_size, &out_array), 0);
+  for (mx_uint i = 0; i < out_size; ++i) {
+    outputs.push_back(NDArray(out_array[i]));
+  }
+}
+
+inline std::string Executor::DebugStr() {
+  const char *output;
+  MXExecutorPrint(handle_, &output);
+  return std::string(output);
+}
+
+inline void Executor::UpdateAll(Optimizer *opt, float lr, float wd,
+                                int arg_update_begin, int arg_update_end) {
+  arg_update_end = arg_update_end < 0 ? arg_arrays.size() - 1 : arg_update_end;
+  for (int i = arg_update_begin; i < arg_update_end; ++i) {
+    opt->Update(i, arg_arrays[i], grad_arrays[i], lr, wd);
+  }
+}
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_EXECUTOR_HPP_
diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h
new file mode 100644
index 000000000000..cdcc1a8a8fc6
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/initializer.h
@@ -0,0 +1,130 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file initializer.h
+ * \brief random initializer
+ * \author Zhang Chen
+ */
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_INITIALIZER_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_INITIALIZER_H_
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/ndarray.h"
+
+namespace mxnet {
+namespace cpp {
+
+class Initializer {
+ public:
+  static bool StringStartWith(const std::string& name,
+                              const std::string& check_str) {
+    return (name.size() >= check_str.size() &&
+            name.substr(0, check_str.size()) == check_str);
+  }
+  static bool StringEndWith(const std::string& name,
+                            const std::string& check_str) {
+    return (name.size() >= check_str.size() &&
+            name.substr(name.size() - check_str.size(), check_str.size()) ==
+                check_str);
+  }
+  virtual void operator()(const std::string& name, NDArray* arr) {
+    if (StringStartWith(name, "upsampling")) {
+      InitBilinear(arr);
+    } else if (StringEndWith(name, "bias")) {
+      InitBias(arr);
+    } else if (StringEndWith(name, "gamma")) {
+      InitGamma(arr);
+    } else if (StringEndWith(name, "beta")) {
+      InitBeta(arr);
+    } else if (StringEndWith(name, "weight")) {
+      InitWeight(arr);
+    } else if (StringEndWith(name, "moving_mean")) {
+      InitZero(arr);
+    } else if (StringEndWith(name, "moving_var")) {
+      InitOne(arr);
+    } else if (StringEndWith(name, "moving_inv_var")) {
+      InitZero(arr);
+    } else if (StringEndWith(name, "moving_avg")) {
+      InitZero(arr);
+    } else {
+      InitDefault(arr);
+    }
+  }
+
+ protected:
+  virtual void InitBilinear(NDArray* arr) {
+    Shape shape(arr->GetShape());
+    std::vector<float> weight(shape.Size(), 0);
+    int f = std::ceil(shape[3] / 2.0);
+    float c = (2 * f - 1 - f % 2) / (2. * f);
+    for (size_t i = 0; i < shape.Size(); ++i) {
+      int x = i % shape[3];
+      int y = (i / shape[3]) % shape[2];
+      // promote x and y to float before dividing by f; with integer
+      // division the bilinear weights would collapse to integer steps
+      weight[i] = (1 - std::abs(static_cast<float>(x) / f - c)) *
+                  (1 - std::abs(static_cast<float>(y) / f - c));
+    }
+    (*arr).SyncCopyFromCPU(weight);
+  }
+  virtual void InitZero(NDArray* arr) { (*arr) = 0.0f; }
+  virtual void InitOne(NDArray* arr) { (*arr) = 1.0f; }
+  virtual void InitBias(NDArray* arr) { (*arr) = 0.0f; }
+  virtual void InitGamma(NDArray* arr) { (*arr) = 1.0f; }
+  virtual void InitBeta(NDArray* arr) { (*arr) = 0.0f; }
+  virtual void InitWeight(NDArray* arr) {}
+  virtual void InitDefault(NDArray* arr) {}
+};
+
+class Xavier : public Initializer {
+ public:
+  enum RandType {
+    gaussian,
+    uniform
+  } rand_type;
+  enum FactorType {
+    avg,
+    in,
+    out
+  } factor_type;
+  float magnitude;
+  Xavier(RandType rand_type = gaussian, FactorType factor_type = avg,
+         float magnitude = 3)
+      : rand_type(rand_type), factor_type(factor_type), magnitude(magnitude) {}
+
+ protected:
+  virtual void InitWeight(NDArray* arr) {
+    Shape shape(arr->GetShape());
+    float hw_scale = 1.0f;
+    if (shape.ndim() > 2) {
+      for (size_t i = 2; i < shape.ndim(); ++i) {
+        hw_scale *= shape[i];
+      }
+    }
+    float fan_in = shape[1] * hw_scale, fan_out = shape[0] * hw_scale;
+    float factor = 1.0f;
+    switch (factor_type) {
+      case avg:
+        factor = (fan_in + fan_out) / 2.0;
+        break;
+      case in:
+        factor = fan_in;
+        break;
+      case out:
+        factor = fan_out;
+    }
+    float scale = std::sqrt(magnitude / factor);
+    switch (rand_type) {
+      case uniform:
+        NDArray::SampleUniform(-scale, scale, arr);
+        break;
+      case gaussian:
+        NDArray::SampleGaussian(0, scale, arr);
+        break;
+    }
+  }
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_INITIALIZER_H_
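Initializer dispatches on the parameter's name suffix, so typical use is to run it over the argument map after InferArgsMap. A sketch (note the suffix rules: names ending in "weight" get the Xavier rule, "bias"/"beta" get zeros, "gamma" ones; anything else, including "data", falls through to the no-op InitDefault):

// Initialize every inferred argument by name.
Xavier xavier;  // gaussian, avg factor, magnitude 3 by default
for (auto &arg : args_map) {
  xavier(arg.first, &arg.second);  // operator()(name, NDArray*)
}
NDArray::WaitAll();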
diff --git a/cpp-package/include/mxnet-cpp/io.h b/cpp-package/include/mxnet-cpp/io.h
new file mode 100644
index 000000000000..171803831109
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/io.h
@@ -0,0 +1,128 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file io.h
+* \brief definition of io, such as DataIter
+* \author Zhang Chen
+*/
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_H_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/ndarray.h"
+#include "dmlc/logging.h"
+
+namespace mxnet {
+namespace cpp {
+/*!
+* \brief Default object for holding a mini-batch of data and related
+* information.
+*/
+class DataBatch {
+ public:
+  NDArray data;
+  NDArray label;
+  int pad_num;
+  std::vector<int> index;
+};
+class DataIter {
+ public:
+  virtual void BeforeFirst(void) = 0;
+  virtual bool Next(void) = 0;
+  virtual NDArray GetData(void) = 0;
+  virtual NDArray GetLabel(void) = 0;
+  virtual int GetPadNum(void) = 0;
+  virtual std::vector<int> GetIndex(void) = 0;
+
+  DataBatch GetDataBatch() {
+    return DataBatch{GetData(), GetLabel(), GetPadNum(), GetIndex()};
+  }
+  void Reset() { BeforeFirst(); }
+};
+
+class MXDataIterMap {
+ public:
+  inline MXDataIterMap() {
+    mx_uint num_data_iter_creators = 0;
+    DataIterCreator *data_iter_creators = nullptr;
+    int r = MXListDataIters(&num_data_iter_creators, &data_iter_creators);
+    CHECK_EQ(r, 0);
+    for (mx_uint i = 0; i < num_data_iter_creators; i++) {
+      const char *name;
+      const char *description;
+      mx_uint num_args;
+      const char **arg_names;
+      const char **arg_type_infos;
+      const char **arg_descriptions;
+      r = MXDataIterGetIterInfo(data_iter_creators[i], &name, &description,
+                                &num_args, &arg_names, &arg_type_infos,
+                                &arg_descriptions);
+      CHECK_EQ(r, 0);
+      mxdataiter_creators_[name] = data_iter_creators[i];
+    }
+  }
+  inline DataIterCreator GetMXDataIterCreator(const std::string &name) {
+    return mxdataiter_creators_[name];
+  }
+
+ private:
+  std::map<std::string, DataIterCreator> mxdataiter_creators_;
+};
+
+struct MXDataIterBlob {
+ public:
+  MXDataIterBlob() : handle_(nullptr) {}
+  explicit MXDataIterBlob(DataIterHandle handle) : handle_(handle) {}
+  ~MXDataIterBlob() { MXDataIterFree(handle_); }
+  DataIterHandle handle_;
+
+ private:
+  MXDataIterBlob &operator=(const MXDataIterBlob &);
+};
+
+class MXDataIter : public DataIter {
+ public:
+  explicit MXDataIter(const std::string &mxdataiter_type);
+  MXDataIter(const MXDataIter &other) {
+    creator_ = other.creator_;
+    params_ = other.params_;
+    blob_ptr_ = other.blob_ptr_;
+  }
+  void BeforeFirst();
+  bool Next();
+  NDArray GetData();
+  NDArray GetLabel();
+  int GetPadNum();
+  std::vector<int> GetIndex();
+  MXDataIter CreateDataIter();
+  /*!
+  * \brief set config parameters
+  * \param name name of the config parameter
+  * \param value value of the config parameter
+  * \return reference of self
+  */
+  template <typename T>
+  MXDataIter &SetParam(const std::string &name, const T &value) {
+    std::string value_str;
+    std::stringstream ss;
+    ss << value;
+    ss >> value_str;
+
+    params_[name] = value_str;
+    return *this;
+  }
+
+ private:
+  DataIterCreator creator_;
+  std::map<std::string, std::string> params_;
+  std::shared_ptr<MXDataIterBlob> blob_ptr_;
+  static MXDataIterMap*& mxdataiter_map();
+};
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_H_
diff --git a/cpp-package/include/mxnet-cpp/io.hpp b/cpp-package/include/mxnet-cpp/io.hpp
new file mode 100644
index 000000000000..61e575e949a9
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/io.hpp
@@ -0,0 +1,90 @@
+/*!
+* Copyright (c) 2016 by Contributors
+* \file io.hpp
+* \brief implementation of data iter
+* \author Zhang Chen
+*/
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_HPP_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_HPP_
+
+#include <string>
+#include <vector>
+#include "mxnet-cpp/io.h"
+
+namespace mxnet {
+namespace cpp {
+
+inline MXDataIterMap*& MXDataIter::mxdataiter_map() {
+  static MXDataIterMap* mxdataiter_map_ = new MXDataIterMap;
+  return mxdataiter_map_;
+}
+
+inline MXDataIter::MXDataIter(const std::string &mxdataiter_type) {
+  creator_ = mxdataiter_map()->GetMXDataIterCreator(mxdataiter_type);
+  blob_ptr_ = std::make_shared<MXDataIterBlob>(nullptr);
+}
+
+inline void MXDataIter::BeforeFirst() {
+  int r = MXDataIterBeforeFirst(blob_ptr_->handle_);
+  CHECK_EQ(r, 0);
+}
+
+inline bool MXDataIter::Next() {
+  int out;
+  int r = MXDataIterNext(blob_ptr_->handle_, &out);
+  CHECK_EQ(r, 0);
+  return out;
+}
+
+inline NDArray MXDataIter::GetData() {
+  NDArrayHandle handle;
+  int r = MXDataIterGetData(blob_ptr_->handle_, &handle);
+  CHECK_EQ(r, 0);
+  return NDArray(handle);
+}
+
+inline NDArray MXDataIter::GetLabel() {
+  NDArrayHandle handle;
+  int r = MXDataIterGetLabel(blob_ptr_->handle_, &handle);
+  CHECK_EQ(r, 0);
+  return NDArray(handle);
+}
+
+inline int MXDataIter::GetPadNum() {
+  int out;
+  int r = MXDataIterGetPadNum(blob_ptr_->handle_, &out);
+  CHECK_EQ(r, 0);
+  return out;
+}
+inline std::vector<int> MXDataIter::GetIndex() {
+  uint64_t *out_index, out_size;
+  int r = MXDataIterGetIndex(blob_ptr_->handle_, &out_index, &out_size);
+  CHECK_EQ(r, 0);
+  std::vector<int> ret;
+  for (uint64_t i = 0; i < out_size; ++i) {
+    ret.push_back(out_index[i]);
+  }
+  return ret;
+}
+
+inline MXDataIter MXDataIter::CreateDataIter() {
+  std::vector<const char*> param_keys;
+  std::vector<const char*> param_values;
+
+  for (auto &data : params_) {
+    param_keys.push_back(data.first.c_str());
+    param_values.push_back(data.second.c_str());
+  }
+
+  MXDataIterCreateIter(creator_, param_keys.size(), param_keys.data(),
+                       param_values.data(), &blob_ptr_->handle_);
+  return *this;
+}
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_IO_HPP_
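DataIter is the abstract contract here: anything implementing BeforeFirst/Next/GetData/GetLabel can feed the training loops in the examples above. A minimal in-memory iterator sketch (hypothetical, not part of the patch), serving pre-sliced batches:

// Hypothetical in-memory DataIter: serves pre-sliced NDArray batches.
class VectorDataIter : public DataIter {
 public:
  VectorDataIter(std::vector<NDArray> data, std::vector<NDArray> label)
      : data_(std::move(data)), label_(std::move(label)), pos_(-1) {}
  void BeforeFirst(void) override { pos_ = -1; }
  bool Next(void) override { return ++pos_ < static_cast<int>(data_.size()); }
  NDArray GetData(void) override { return data_[pos_]; }
  NDArray GetLabel(void) override { return label_[pos_]; }
  int GetPadNum(void) override { return 0; }             // batches never padded
  std::vector<int> GetIndex(void) override { return {}; }

 private:
  std::vector<NDArray> data_, label_;
  int pos_;
};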
+  static void SetOptimizer(std::unique_ptr<Optimizer> optimizer, bool local = false);
+  static std::string GetType();
+  static int GetRank();
+  static int GetNumWorkers();
+  static void Barrier();
+  static std::string GetRole();
+
+ private:
+  KVStore();
+  static KVStoreHandle& get_handle();
+  static std::unique_ptr<Optimizer>& get_optimizer();
+  static KVStore*& get_kvstore();
+  static void Controller(int head, const char* body, void* controller_handle);
+  static void Updater(int key, NDArrayHandle recv, NDArrayHandle local, void* handle_);
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_KVSTORE_H_
diff --git a/cpp-package/include/mxnet-cpp/kvstore.hpp b/cpp-package/include/mxnet-cpp/kvstore.hpp
new file mode 100644
index 000000000000..d9effcf82f3c
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/kvstore.hpp
@@ -0,0 +1,178 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file kvstore.hpp
+ * \brief implementation of kvstore
+ * \author Xin Li
+ */
+
+#include <algorithm>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "mxnet-cpp/kvstore.h"
+#include "mxnet-cpp/optimizer.h"
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_KVSTORE_HPP_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_KVSTORE_HPP_
+
+namespace mxnet {
+namespace cpp {
+
+inline void KVStore::Controller(int head, const char* body, void* controller_handle) {
+  if (head == 0) {
+    std::map<std::string, std::string> params;
+    std::istringstream sin(body);
+    std::string line;
+    while (getline(sin, line)) {
+      size_t n = line.find('=');
+      params.emplace(line.substr(0, n), line.substr(n + 1));
+    }
+    std::unique_ptr<Optimizer> opt(OptimizerRegistry::Find(params.at("opt_type")));
+    params.erase("opt_type");
+    for (const auto& pair : params) {
+      opt->SetParam(pair.first, pair.second);
+    }
+    get_kvstore()->SetOptimizer(std::move(opt), true);
+  }
+}
+
+inline KVStoreHandle& KVStore::get_handle() {
+  static KVStoreHandle handle_ = nullptr;
+  return handle_;
+}
+
+inline std::unique_ptr<Optimizer>& KVStore::get_optimizer() {
+  static std::unique_ptr<Optimizer> optimizer_;
+  return optimizer_;
+}
+
+inline KVStore*& KVStore::get_kvstore() {
+  static KVStore* kvstore_ = new KVStore;
+  return kvstore_;
+}
+
+inline KVStore::KVStore() {}
+
+inline void KVStore::SetType(const std::string& type) {
+  CHECK_EQ(MXKVStoreCreate(type.c_str(), &(get_kvstore()->get_handle())), 0);
+}
+
+inline void KVStore::RunServer() {
+  CHECK_NE(GetRole(), "worker");
+  CHECK_EQ(MXKVStoreRunServer(get_kvstore()->get_handle(), &Controller, 0), 0);
+}
+
+inline void KVStore::Init(int key, const NDArray& val) {
+  NDArrayHandle val_handle = val.GetHandle();
+  CHECK_EQ(MXKVStoreInit(get_kvstore()->get_handle(), 1, &key, &val_handle), 0);
+}
+
+inline void KVStore::Init(const std::vector<int>& keys, const std::vector<NDArray>& vals) {
+  CHECK_EQ(keys.size(), vals.size());
+  std::vector<NDArrayHandle> val_handles(vals.size());
+  std::transform(vals.cbegin(), vals.cend(), val_handles.begin(),
+                 [](const NDArray& val) {
+                   return val.GetHandle();
+                 });
+
+  CHECK_EQ(MXKVStoreInit(get_kvstore()->get_handle(), keys.size(), keys.data(),
+                         val_handles.data()), 0);
+}
+
+inline void KVStore::Push(int key, const NDArray& val, int priority) {
+  NDArrayHandle val_handle = val.GetHandle();
+  CHECK_EQ(MXKVStorePush(get_kvstore()->get_handle(), 1, &key, &val_handle, priority), 0);
+}
+
+inline void KVStore::Push(const std::vector<int>& keys,
+                          const std::vector<NDArray>& vals,
+                          int priority) {
+  CHECK_EQ(keys.size(), vals.size());
+  std::vector<NDArrayHandle> val_handles(vals.size());
+  std::transform(vals.cbegin(), vals.cend(), val_handles.begin(),
+                 [](const NDArray& val) {
+                   return val.GetHandle();
+                 });
+
+  CHECK_EQ(MXKVStorePush(get_kvstore()->get_handle(), keys.size(), keys.data(),
+                         val_handles.data(), priority), 0);
+}
+
+inline void KVStore::Pull(int key, NDArray* out, int priority) {
+  NDArrayHandle out_handle = out->GetHandle();
+  CHECK_EQ(MXKVStorePull(get_kvstore()->get_handle(), 1, &key, &out_handle, priority), 0);
+}
+
+inline void KVStore::Pull(const std::vector<int>& keys, std::vector<NDArray>* outs, int priority) {
+  CHECK_EQ(keys.size(), outs->size());
+
+  std::vector<NDArrayHandle> out_handles(keys.size());
+  std::transform(outs->cbegin(), outs->cend(), out_handles.begin(),
+                 [](const NDArray& val) {
+                   return val.GetHandle();
+                 });
+
+  CHECK_EQ(MXKVStorePull(get_kvstore()->get_handle(), keys.size(), keys.data(),
+                         out_handles.data(), priority), 0);
+}
+
+inline void KVStore::Updater(int key, NDArrayHandle recv, NDArrayHandle local,
+                             void* handle_) {
+  Optimizer *opt = static_cast<Optimizer*>(handle_);
+  opt->Update(key, NDArray(local), NDArray(recv));
+}
+
+inline void KVStore::SetOptimizer(std::unique_ptr<Optimizer> optimizer, bool local) {
+  if (local) {
+    get_kvstore()->get_optimizer() = std::move(optimizer);
+    CHECK_EQ(MXKVStoreSetUpdater(get_kvstore()->get_handle(),
+                                 &Updater, get_kvstore()->get_optimizer().get()), 0);
+  } else {
+    CHECK_EQ(MXKVStoreSendCommmandToServers(get_kvstore()->get_handle(), 0,
+                                            (*optimizer).Serialize().c_str()), 0);
+  }
+}
+
+inline std::string KVStore::GetType() {
+  const char *type;
+  CHECK_EQ(MXKVStoreGetType(get_kvstore()->get_handle(), &type), 0);
+  return type;
+}
+
+inline int KVStore::GetRank() {
+  int rank;
+  CHECK_EQ(MXKVStoreGetRank(get_kvstore()->get_handle(), &rank), 0);
+  return rank;
+}
+
+inline int KVStore::GetNumWorkers() {
+  int num_workers;
+  CHECK_EQ(MXKVStoreGetGroupSize(get_kvstore()->get_handle(), &num_workers), 0);
+  return num_workers;
+}
+
+inline void KVStore::Barrier() {
+  CHECK_EQ(MXKVStoreBarrier(get_kvstore()->get_handle()), 0);
+}
+
+inline std::string KVStore::GetRole() {
+  int ret;
+  CHECK_EQ(MXKVStoreIsSchedulerNode(&ret), 0);
+  if (ret) {
+    return "scheduler";
+  }
+  CHECK_EQ(MXKVStoreIsServerNode(&ret), 0);
+  if (ret) {
+    return "server";
+  }
+  CHECK_EQ(MXKVStoreIsWorkerNode(&ret), 0);
+  CHECK(ret);
+  return "worker";
+}
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_KVSTORE_HPP_
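A sketch of how a worker process would drive this KVStore wrapper for data-parallel SGD, assuming a distributed launcher has set the usual environment; `weights`, `grads`, and `num_steps` are illustrative placeholders, and key 0 stands for a single parameter array:

// Minimal data-parallel sketch on top of the static KVStore API above.
KVStore::SetType("dist_async");
if (KVStore::GetRole() == "server") {
  KVStore::RunServer();             // servers block here, applying updates
} else if (KVStore::GetRole() == "worker") {
  KVStore::Init(0, weights);        // every worker initializes key 0 alike
  for (int step = 0; step < num_steps; ++step) {
    // ... Forward/Backward fills `grads` ...
    KVStore::Push(0, grads);        // send the local gradient
    KVStore::Pull(0, &weights);     // fetch the aggregated result
  }
}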
+* Copyright (c) 2016 by Contributors
+* \file metric.h
+* \brief evaluation metrics
+* \author Zhang Chen
+*/
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_METRIC_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_METRIC_H_
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/ndarray.h"
+#include "dmlc/logging.h"
+
+namespace mxnet {
+namespace cpp {
+
+class EvalMetric {
+ public:
+  explicit EvalMetric(const std::string& name, int num = 0)
+      : name(name), num(num) {}
+  virtual void Update(NDArray labels, NDArray preds) = 0;
+  void Reset() {
+    num_inst = 0;
+    sum_metric = 0.0f;
+  }
+  float Get() { return sum_metric / num_inst; }
+  void GetNameValue();
+
+ protected:
+  std::string name;
+  int num;
+  float sum_metric = 0.0f;
+  int num_inst = 0;
+
+  static bool CheckLabelShapes(NDArray labels, NDArray preds,
+                               Shape shape = Shape(0)) {
+    // TODO(zhangchen-qinyinghua)
+    // implement this
+    return true;
+  }
+};
+
+class Accuracy : public EvalMetric {
+ public:
+  Accuracy() : EvalMetric("accuracy") {}
+
+  void Update(NDArray labels, NDArray preds) {
+    CHECK_EQ(labels.GetShape().size(), 1);
+    mx_uint len = labels.GetShape()[0];
+    std::vector<mx_float> pred_data(len);
+    std::vector<mx_float> label_data(len);
+    preds.ArgmaxChannel().SyncCopyToCPU(&pred_data, len);
+    labels.SyncCopyToCPU(&label_data, len);
+    NDArray::WaitAll();
+    for (mx_uint i = 0; i < len; ++i) {
+      sum_metric += (pred_data[i] == label_data[i]) ? 1 : 0;
+      num_inst += 1;
+    }
+  }
+};
+
+class LogLoss : public EvalMetric {
+ public:
+  LogLoss() : EvalMetric("logloss") {}
+
+  void Update(NDArray labels, NDArray preds) {
+    static const float epsilon = 1e-15;
+    mx_uint len = labels.GetShape()[0];
+    mx_uint m = preds.GetShape()[1];
+    std::vector<mx_float> pred_data(len * m);
+    std::vector<mx_float> label_data(len);
+    preds.SyncCopyToCPU(&pred_data, pred_data.size());
+    labels.SyncCopyToCPU(&label_data, len);
+    NDArray::WaitAll();
+    for (mx_uint i = 0; i < len; ++i) {
+      // labels arrive as floats; they must be cast before indexing
+      sum_metric += -std::log(
+          std::max(pred_data[i * m + static_cast<mx_uint>(label_data[i])],
+                   epsilon));
+      num_inst += 1;
+    }
+  }
+};
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_METRIC_H_
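New metrics follow the same shape as Accuracy: subclass EvalMetric and accumulate sum_metric/num_inst inside Update. A sketch of a mean-absolute-error metric for regression outputs (hypothetical, not part of the patch):

// Hypothetical MAE metric in the style of Accuracy above.
class MAE : public EvalMetric {
 public:
  MAE() : EvalMetric("mae") {}
  void Update(NDArray labels, NDArray preds) override {
    mx_uint len = labels.GetShape()[0];
    std::vector<mx_float> pred_data(len);
    std::vector<mx_float> label_data(len);
    preds.SyncCopyToCPU(&pred_data, len);
    labels.SyncCopyToCPU(&label_data, len);
    NDArray::WaitAll();
    for (mx_uint i = 0; i < len; ++i) {
      sum_metric += std::fabs(pred_data[i] - label_data[i]);
      num_inst += 1;
    }
  }
};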
+ * \brief construct with a NDArrayHandle + */ + explicit NDArray(const NDArrayHandle &handle); + /*! + * \brief construct a new dynamic NDArray + * \param shape the shape of array + * \param constext context of NDArray + * \param delay_alloc whether delay the allocation + */ + NDArray(const std::vector &shape, const Context &context, + bool delay_alloc = true); + /*! + * \brief construct a new dynamic NDArray + * \param shape the shape of array + * \param constext context of NDArray + * \param delay_alloc whether delay the allocation + */ + NDArray(const Shape &shape, const Context &context, bool delay_alloc = true); + NDArray(const mx_float *data, size_t size); + /*! + * \brief construct a new dynamic NDArray + * \param data the data to create NDArray from + * \param shape the shape of array + * \param constext context of NDArray + */ + NDArray(const mx_float *data, const Shape &shape, const Context &context); + /*! + * \brief construct a new dynamic NDArray + * \param data the data to create NDArray from + * \param shape the shape of array + * \param constext context of NDArray + */ + NDArray(const std::vector &data, const Shape &shape, + const Context &context); + explicit NDArray(const std::vector &data); + NDArray operator+(mx_float scalar); + NDArray operator-(mx_float scalar); + NDArray operator*(mx_float scalar); + NDArray operator/(mx_float scalar); + NDArray operator+(const NDArray &); + NDArray operator-(const NDArray &); + NDArray operator*(const NDArray &); + NDArray operator/(const NDArray &); + /*! + * \brief set all the elements in ndarray to be scalar + * \param scalar the scalar to set + * \return reference of self + */ + NDArray &operator=(mx_float scalar); + /*! + * \brief elementwise add to current space + * this mutate the current NDArray + * \param scalar the data to add + * \return reference of self + */ + NDArray &operator+=(mx_float scalar); + /*! + * \brief elementwise subtract from current ndarray + * this mutate the current NDArray + * \param scalar the data to substract + * \return reference of self + */ + NDArray &operator-=(mx_float scalar); + /*! + * \brief elementwise multiplication to current ndarray + * this mutate the current NDArray + * \param scalar the data to substract + * \return reference of self + */ + NDArray &operator*=(mx_float scalar); + /*! + * \brief elementwise division from current ndarray + * this mutate the current NDArray + * \param scalar the data to substract + * \return reference of self + */ + NDArray &operator/=(mx_float scalar); + /*! + * \brief elementwise add to current space + * this mutate the current NDArray + * \param src the data to add + * \return reference of self + */ + NDArray &operator+=(const NDArray &src); + /*! + * \brief elementwise subtract from current ndarray + * this mutate the current NDArray + * \param src the data to substract + * \return reference of self + */ + NDArray &operator-=(const NDArray &src); + /*! + * \brief elementwise multiplication to current ndarray + * this mutate the current NDArray + * \param src the data to substract + * \return reference of self + */ + NDArray &operator*=(const NDArray &src); + /*! + * \brief elementwise division from current ndarray + * this mutate the current NDArray + * \param src the data to substract + * \return reference of self + */ + NDArray &operator/=(const NDArray &src); + NDArray ArgmaxChannel(); + /*! + * \brief Do a synchronize copy from a continugous CPU memory region. + * + * This function will call WaitToWrite before the copy is performed. 
+   * This is useful to copy data from an existing memory region that is
+   * not wrapped by an NDArray (thus dependency not being tracked).
+   *
+   * \param data the data source to copy from.
+   * \param size the memory size we want to copy from.
+   */
+  void SyncCopyFromCPU(const mx_float *data, size_t size);
+  /*!
+   * \brief Do a synchronous copy from a contiguous CPU memory region.
+   *
+   * This function will call WaitToWrite before the copy is performed.
+   * This is useful to copy data from an existing memory region that is
+   * not wrapped by an NDArray (thus dependency not being tracked).
+   *
+   * \param data the data source to copy from, in the form of an mx_float vector
+   */
+  void SyncCopyFromCPU(const std::vector<mx_float> &data);
+  /*!
+   * \brief Do a synchronous copy to a contiguous CPU memory region.
+   *
+   * This function will call WaitToRead before the copy is performed.
+   * This is useful to copy data into an existing memory region that is
+   * not wrapped by an NDArray (thus dependency not being tracked).
+   *
+   * \param data the destination to copy into.
+   * \param size the memory size we want to copy. Default value is Size()
+   */
+  void SyncCopyToCPU(mx_float *data, size_t size = 0);
+  /*!
+   * \brief Do a synchronous copy to a contiguous CPU memory region.
+   *
+   * This function will call WaitToRead before the copy is performed.
+   * This is useful to copy data into an existing memory region that is
+   * not wrapped by an NDArray (thus dependency not being tracked).
+   *
+   * \param data the destination to copy into.
+   * \param size the memory size we want to copy. Default value is Size()
+   */
+  void SyncCopyToCPU(std::vector<mx_float> *data, size_t size = 0);
+  /*!
+   * \brief Copy the content of current array into other.
+   * \param other the target NDArray to copy into
+   * \return the target NDArray
+   */
+  NDArray CopyTo(NDArray * other) const;
+  /*!
+   * \brief return a new copy of this NDArray in the given context
+   * \return the new copy
+   */
+  NDArray Copy(const Context &) const;
+  /*!
+   * \brief return offset of the element at (h, w)
+   * \param h height position
+   * \param w width position
+   * \return offset of two dimensions array
+   */
+  size_t Offset(size_t h = 0, size_t w = 0) const;
+  /*!
+   * \brief return offset of three dimensions array
+   * \param c channel position
+   * \param h height position
+   * \param w width position
+   * \return offset of three dimensions array
+   */
+  size_t Offset(size_t c, size_t h, size_t w) const;
+  /*!
+   * \brief return value of the element at (h, w)
+   * \param h height position
+   * \param w width position
+   * \return value of two dimensions array
+   */
+  mx_float At(size_t h, size_t w) const;
+  /*!
+   * \brief return value of three dimensions array
+   * \param c channel position
+   * \param h height position
+   * \param w width position
+   * \return value of three dimensions array
+   */
+  mx_float At(size_t c, size_t h, size_t w) const;
+  /*!
+   * \brief Slice a NDArray
+   * \param begin begin index in first dim
+   * \param end end index in first dim
+   * \return sliced NDArray
+   */
+  NDArray Slice(mx_uint begin, mx_uint end) const;
+  /*!
+   * \brief Return a reshaped NDArray that shares memory with current one
+   * \param new_shape the new shape
+   * \return reshaped NDarray
+   */
+  NDArray Reshape(const Shape &new_shape) const;
+  /*!
+   * \brief Block until all the pending write operations with respect
+   * to current NDArray are finished, and read can be performed.
+   */
+  void WaitToRead() const;
+  /*!
+   * \brief Block until all the pending read/write operations with respect
+   * to current NDArray are finished, and write can be performed.
+   */
+  void WaitToWrite();
+  /*!
+   * \brief Block until all the pending read/write operations with respect
+   * to current NDArray are finished, and read/write can be performed.
+   */
+  static void WaitAll();
+  /*!
+   * \brief Sample gaussian distribution for each element of out.
+   * \param mu mean of gaussian distribution.
+   * \param sigma standard deviation of gaussian distribution.
+   * \param out output NDArray.
+   */
+  static void SampleGaussian(mx_float mu, mx_float sigma, NDArray *out);
+  /*!
+   * \brief Sample uniform distribution for each element of out.
+   * \param begin lower bound of distribution.
+   * \param end upper bound of distribution.
+   * \param out output NDArray.
+   */
+  static void SampleUniform(mx_float begin, mx_float end, NDArray *out);
+  /*!
+   * \brief Load NDArrays from binary file.
+   * \param file_name name of the binary file.
+   * \param array_list a list of NDArrays returned, do not fill the list if
+   * nullptr is given.
+   * \param array_map a map from names to NDArrays returned, do not fill the map
+   * if nullptr is given or no names are stored in the binary file.
+   */
+  static void Load(const std::string &file_name,
+                   std::vector<NDArray> *array_list = nullptr,
+                   std::map<std::string, NDArray> *array_map = nullptr);
+  /*!
+   * \brief Load map of NDArrays from binary file.
+   * \param file_name name of the binary file.
+   * \return a map from names to NDArrays.
+   */
+  static std::map<std::string, NDArray> LoadToMap(const std::string &file_name);
+  /*!
+   * \brief Load list of NDArrays from binary file.
+   * \param file_name name of the binary file.
+   * \return a list of NDArrays.
+   */
+  static std::vector<NDArray> LoadToList(const std::string &file_name);
+  /*!
+   * \brief save a map of string->NDArray to binary file.
+   * \param file_name name of the binary file.
+   * \param array_map a map from names to NDArrays.
+   */
+  static void Save(const std::string &file_name,
+                   const std::map<std::string, NDArray> &array_map);
+  /*!
+   * \brief save a list of NDArrays to binary file.
+   * \param file_name name of the binary file.
+   * \param array_list a list of NDArrays.
+   */
+  static void Save(const std::string &file_name,
+                   const std::vector<NDArray> &array_list);
+  /*!
+   * \return the size of current NDArray, a.k.a. the product of all shape dims
+   */
+  size_t Size() const;
+  /*!
+   * \return the shape of current NDArray, in the form of mx_uint vector
+   */
+  std::vector<mx_uint> GetShape() const;
+  /*!
+   * \return the data type of current NDArray
+   */
+  int GetDType() const;
+  /*!
+   * \return the data pointer to the current NDArray
+   */
+  const mx_float *GetData() const;
+
+  /*!
+   * \return the context of NDArray
+   */
+  Context GetContext() const;
+
+  /*!
+   * \return the NDArrayHandle of the current NDArray
+   */
+  NDArrayHandle GetHandle() const { return blob_ptr_->handle_; }
+
+ private:
+  std::shared_ptr<NDBlob> blob_ptr_;
+};
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_NDARRAY_H_
diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp
new file mode 100644
index 000000000000..addf0d3870a5
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/ndarray.hpp
@@ -0,0 +1,341 @@
+/*!
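Taken together, ndarray.h supports a small end-to-end flow: build an array, do elementwise math, synchronize, copy back to the host, and persist. A minimal sketch (file name and values are illustrative):

using namespace mxnet::cpp;

std::vector<mx_float> src = {1, 2, 3, 4, 5, 6};
NDArray a(src, Shape(2, 3), Context::cpu());
NDArray b = a * 2.0f + 1.0f;   // each call dispatches an engine op
b.WaitToRead();                // block until pending writes complete
std::vector<mx_float> host;
b.SyncCopyToCPU(&host);        // resizes host to b.Size() and copies
NDArray::Save("arrays.bin", std::map<std::string, NDArray>{{"b", b}});
auto restored = NDArray::LoadToMap("arrays.bin");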
+ * Copyright (c) 2016 by Contributors + * \file ndarray.hpp + * \brief implementation of the ndarray + * \author Zhang Chen, Chuntao Hong + */ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_NDARRAY_HPP_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_NDARRAY_HPP_ + +#include +#include +#include +#include "dmlc/logging.h" +#include "mxnet-cpp/ndarray.h" + +namespace mxnet { +namespace cpp { + +inline NDArray::NDArray() { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreateNone(&handle), 0); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const NDArrayHandle &handle) { + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const std::vector &shape, const Context &context, + bool delay_alloc) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreate(shape.data(), shape.size(), context.GetDeviceType(), + context.GetDeviceId(), delay_alloc, &handle), + 0); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const Shape &shape, const Context &context, bool delay_alloc) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(), + context.GetDeviceId(), delay_alloc, &handle), + 0); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const mx_float *data, size_t size) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreateNone(&handle), 0); + MXNDArraySyncCopyFromCPU(handle, data, size); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const mx_float *data, const Shape &shape, + const Context &context) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(), + context.GetDeviceId(), false, &handle), + 0); + MXNDArraySyncCopyFromCPU(handle, data, shape.Size()); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const std::vector &data, const Shape &shape, + const Context &context) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreate(shape.data(), shape.ndim(), context.GetDeviceType(), + context.GetDeviceId(), false, &handle), + 0); + MXNDArraySyncCopyFromCPU(handle, data.data(), shape.Size()); + blob_ptr_ = std::make_shared(handle); +} +inline NDArray::NDArray(const std::vector &data) { + NDArrayHandle handle; + CHECK_EQ(MXNDArrayCreateNone(&handle), 0); + MXNDArraySyncCopyFromCPU(handle, data.data(), data.size()); + blob_ptr_ = std::make_shared(handle); +} + +inline NDArray NDArray::operator+(mx_float scalar) { + NDArray ret; + Operator("_plus_scalar")(*this, scalar).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator-(mx_float scalar) { + NDArray ret; + Operator("_minus_scalar")(*this, scalar).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator*(mx_float scalar) { + NDArray ret; + Operator("_mul_scalar")(*this, scalar).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator/(mx_float scalar) { + NDArray ret; + Operator("_div_scalar")(*this, scalar).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator+(const NDArray &rhs) { + NDArray ret; + Operator("_plus")(*this, rhs).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator-(const NDArray &rhs) { + NDArray ret; + Operator("_minus")(*this, rhs).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator*(const NDArray &rhs) { + NDArray ret; + Operator("_mul")(*this, rhs).Invoke(ret); + return ret; +} +inline NDArray NDArray::operator/(const NDArray &rhs) { + NDArray ret; + Operator("_div")(*this, rhs).Invoke(ret); + return ret; +} +inline NDArray &NDArray::operator=(mx_float scalar) { + 
Operator("_set_value")(scalar).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator+=(mx_float scalar) { + Operator("_plus_scalar")(*this, scalar).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator-=(mx_float scalar) { + Operator("_minus_scalar")(*this, scalar).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator*=(mx_float scalar) { + Operator("_mul_scalar")(*this, scalar).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator/=(mx_float scalar) { + Operator("_div_scalar")(*this, scalar).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator+=(const NDArray &rhs) { + Operator("_plus")(*this, rhs).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator-=(const NDArray &rhs) { + Operator("_minus")(*this, rhs).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator*=(const NDArray &rhs) { + Operator("_mul")(*this, rhs).Invoke(*this); + return *this; +} +inline NDArray &NDArray::operator/=(const NDArray &rhs) { + Operator("_div")(*this, rhs).Invoke(*this); + return *this; +} + +inline NDArray NDArray::ArgmaxChannel() { + NDArray ret; + Operator("argmax_channel")(*this).Invoke(ret); + return ret; +} + +inline void NDArray::SyncCopyFromCPU(const mx_float *data, size_t size) { + MXNDArraySyncCopyFromCPU(blob_ptr_->handle_, data, size); +} +inline void NDArray::SyncCopyFromCPU(const std::vector &data) { + MXNDArraySyncCopyFromCPU(blob_ptr_->handle_, data.data(), data.size()); +} +inline void NDArray::SyncCopyToCPU(mx_float *data, size_t size) { + MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data, size > 0 ? size : Size()); +} +inline void NDArray::SyncCopyToCPU(std::vector *data, size_t size) { + size = size > 0 ? size : Size(); + data->resize(size); + MXNDArraySyncCopyToCPU(blob_ptr_->handle_, data->data(), size); +} +inline NDArray NDArray::Copy(const Context &ctx) const { + NDArray ret(GetShape(), ctx); + Operator("_copyto")(*this).Invoke(ret); + return ret; +} +inline NDArray NDArray::CopyTo(NDArray * other) const { + Operator("_copyto")(*this).Invoke(*other); + return *other; +} +inline NDArray NDArray::Slice(mx_uint begin, mx_uint end) const { + NDArrayHandle handle; + CHECK_EQ(MXNDArraySlice(GetHandle(), begin, end, &handle), 0); + return NDArray(handle); +} +inline NDArray NDArray::Reshape(const Shape &new_shape) const { + NDArrayHandle handle; + std::vector dims(new_shape.ndim()); + for (index_t i = 0; i < new_shape.ndim(); ++i) { + dims[i] = new_shape[i]; + } + new_shape.data(); + CHECK_EQ( + MXNDArrayReshape(GetHandle(), new_shape.ndim(), dims.data(), &handle), 0); + return NDArray(handle); +} +inline void NDArray::WaitToRead() const { + CHECK_EQ(MXNDArrayWaitToRead(blob_ptr_->handle_), 0); +} +inline void NDArray::WaitToWrite() { + CHECK_EQ(MXNDArrayWaitToWrite(blob_ptr_->handle_), 0); +} +inline void NDArray::WaitAll() { CHECK_EQ(MXNDArrayWaitAll(), 0); } +inline void NDArray::SampleGaussian(mx_float mu, mx_float sigma, NDArray *out) { + Operator("_sample_normal")(mu, sigma).Invoke(*out); +} +inline void NDArray::SampleUniform(mx_float begin, mx_float end, NDArray *out) { + Operator("_sample_uniform")(begin, end).Invoke(*out); +} +inline void NDArray::Load(const std::string &file_name, + std::vector *array_list, + std::map *array_map) { + mx_uint out_size, out_name_size; + NDArrayHandle *out_arr; + const char **out_names; + CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size, + &out_names), + 0); + if (array_list != nullptr) { + for (mx_uint i = 0; i < 
out_size; ++i) { + array_list->push_back(NDArray(out_arr[i])); + } + } + if (array_map != nullptr && out_name_size > 0) { + CHECK_EQ(out_name_size, out_size); + for (mx_uint i = 0; i < out_size; ++i) { + (*array_map)[out_names[i]] = NDArray(out_arr[i]); + } + } +} +inline std::map NDArray::LoadToMap( + const std::string &file_name) { + std::map array_map; + mx_uint out_size, out_name_size; + NDArrayHandle *out_arr; + const char **out_names; + CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size, + &out_names), + 0); + if (out_name_size > 0) { + CHECK_EQ(out_name_size, out_size); + for (mx_uint i = 0; i < out_size; ++i) { + array_map[out_names[i]] = NDArray(out_arr[i]); + } + } + return array_map; +} +inline std::vector NDArray::LoadToList(const std::string &file_name) { + std::vector array_list; + mx_uint out_size, out_name_size; + NDArrayHandle *out_arr; + const char **out_names; + CHECK_EQ(MXNDArrayLoad(file_name.c_str(), &out_size, &out_arr, &out_name_size, + &out_names), + 0); + for (mx_uint i = 0; i < out_size; ++i) { + array_list.push_back(NDArray(out_arr[i])); + } + return array_list; +} +inline void NDArray::Save(const std::string &file_name, + const std::map &array_map) { + std::vector args; + std::vector keys; + for (const auto &t : array_map) { + args.push_back(t.second.GetHandle()); + keys.push_back(t.first.c_str()); + } + CHECK_EQ( + MXNDArraySave(file_name.c_str(), args.size(), args.data(), keys.data()), + 0); +} +inline void NDArray::Save(const std::string &file_name, + const std::vector &array_list) { + std::vector args; + for (const auto &t : array_list) { + args.push_back(t.GetHandle()); + } + CHECK_EQ(MXNDArraySave(file_name.c_str(), args.size(), args.data(), nullptr), + 0); +} + +inline size_t NDArray::Offset(size_t h, size_t w) const { + return (h * GetShape()[1]) + w; +} + +inline size_t NDArray::Offset(size_t c, size_t h, size_t w) const { + auto const shape = GetShape(); + return h * shape[0] * shape[2] + w * shape[0] + c; +} + +inline mx_float NDArray::At(size_t h, size_t w) const { + return GetData()[Offset(h, w)]; +} + +inline mx_float NDArray::At(size_t c, size_t h, size_t w) const { + return GetData()[Offset(c, h, w)]; +} + +inline size_t NDArray::Size() const { + size_t ret = 1; + for (auto &i : GetShape()) ret *= i; + return ret; +} + +inline std::vector NDArray::GetShape() const { + const mx_uint *out_pdata; + mx_uint out_dim; + MXNDArrayGetShape(blob_ptr_->handle_, &out_dim, &out_pdata); + std::vector ret; + for (mx_uint i = 0; i < out_dim; ++i) { + ret.push_back(out_pdata[i]); + } + return ret; +} + +inline int NDArray::GetDType() const { + int ret; + MXNDArrayGetDType(blob_ptr_->handle_, &ret); + return ret; +} + +inline const mx_float *NDArray::GetData() const { + void *ret; + CHECK_NE(GetContext().GetDeviceType(), DeviceType::kGPU); + MXNDArrayGetData(blob_ptr_->handle_, &ret); + if (GetDType() != 0) { + return NULL; + } + return static_cast(ret); +} + +inline Context NDArray::GetContext() const { + int out_dev_type; + int out_dev_id; + MXNDArrayGetContext(blob_ptr_->handle_, &out_dev_type, &out_dev_id); + return Context((DeviceType)out_dev_type, out_dev_id); +} +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_NDARRAY_HPP_ diff --git a/cpp-package/include/mxnet-cpp/op_map.h b/cpp-package/include/mxnet-cpp/op_map.h new file mode 100644 index 000000000000..2a2ae50a4e84 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/op_map.h @@ -0,0 +1,92 @@ +/*! 
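The NDArray implementations above all funnel through the same imperative pattern: look up an operator by name, bind inputs positionally, and Invoke into an output. The same pattern is available directly for engine operators without a dedicated wrapper; a sketch:

using namespace mxnet::cpp;

NDArray x(Shape(2, 2), Context::cpu(), false);  // allocate eagerly
x = 1.0f;                                        // "_set_value", as in operator= above
NDArray y;
Operator("_mul_scalar")(x, 3.0f).Invoke(y);      // the exact chain operator*() uses
y.WaitToRead();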
+* Copyright (c) 2016 by Contributors +* \file op_map.h +* \brief definition of OpMap +* \author Chuntao Hong +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_MAP_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_MAP_H_ + +#include +#include +#include "mxnet-cpp/base.h" +#include "dmlc/logging.h" + +namespace mxnet { +namespace cpp { + +/*! +* \brief OpMap instance holds a map of all the symbol creators so we can +* get symbol creators by name. +* This is used internally by Symbol and Operator. +*/ +class OpMap { + public: + /*! + * \brief Create an Mxnet instance + */ + inline OpMap() { + mx_uint num_symbol_creators = 0; + AtomicSymbolCreator *symbol_creators = nullptr; + int r = + MXSymbolListAtomicSymbolCreators(&num_symbol_creators, &symbol_creators); + CHECK_EQ(r, 0); + for (mx_uint i = 0; i < num_symbol_creators; i++) { + const char *name; + const char *description; + mx_uint num_args; + const char **arg_names; + const char **arg_type_infos; + const char **arg_descriptions; + const char *key_var_num_args; + r = MXSymbolGetAtomicSymbolInfo(symbol_creators[i], &name, &description, + &num_args, &arg_names, &arg_type_infos, + &arg_descriptions, &key_var_num_args); + CHECK_EQ(r, 0); + symbol_creators_[name] = symbol_creators[i]; + } + + nn_uint num_ops; + const char **op_names; + r = NNListAllOpNames(&num_ops, &op_names); + CHECK_EQ(r, 0); + for (nn_uint i = 0; i < num_ops; i++) { + OpHandle handle; + r = NNGetOpHandle(op_names[i], &handle); + CHECK_EQ(r, 0); + op_handles_[op_names[i]] = handle; + } + } + + /*! + * \brief Get a symbol creator with its name. + * + * \param name name of the symbol creator + * \return handle to the symbol creator + */ + inline AtomicSymbolCreator GetSymbolCreator(const std::string &name) { + if (symbol_creators_.count(name) == 0) + return GetOpHandle(name); + return symbol_creators_[name]; + } + + /*! + * \brief Get an op handle with its name. + * + * \param name name of the op + * \return handle to the op + */ + inline OpHandle GetOpHandle(const std::string &name) { + return op_handles_[name]; + } + + private: + std::map symbol_creators_; + std::map op_handles_; +}; + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_MAP_H_ diff --git a/cpp-package/include/mxnet-cpp/op_suppl.h b/cpp-package/include/mxnet-cpp/op_suppl.h new file mode 100644 index 000000000000..5eb86d8ef275 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/op_suppl.h @@ -0,0 +1,188 @@ +/*! 
+* Copyright (c) 2016 by Contributors +* \file op_suppl.h +* \brief A supplement and amendment of the operators from op.h +* \author Zhang Chen, zhubuntu, Xin Li +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_SUPPL_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_SUPPL_H_ + +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/shape.h" +#include "mxnet-cpp/operator.h" +#include "mxnet-cpp/MxNetCpp.h" + +namespace mxnet { +namespace cpp { + +inline Symbol _Plus(Symbol lhs, Symbol rhs) { + return Operator("_Plus")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Mul(Symbol lhs, Symbol rhs) { + return Operator("_Mul")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Minus(Symbol lhs, Symbol rhs) { + return Operator("_Minus")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Div(Symbol lhs, Symbol rhs) { + return Operator("_Div")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Power(Symbol lhs, Symbol rhs) { + return Operator("_Power")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Maximum(Symbol lhs, Symbol rhs) { + return Operator("_Maximum")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _Minimum(Symbol lhs, Symbol rhs) { + return Operator("_Minimum")(lhs, rhs) + .CreateSymbol(); +} +inline Symbol _PlusScalar(Symbol lhs, mx_float scalar) { + return Operator("_PlusScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _MinusScalar(Symbol lhs, mx_float scalar) { + return Operator("_MinusScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _RMinusScalar(mx_float scalar, Symbol rhs) { + return Operator("_RMinusScalar")(rhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _MulScalar(Symbol lhs, mx_float scalar) { + return Operator("_MulScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _DivScalar(Symbol lhs, mx_float scalar) { + return Operator("_DivScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _RDivScalar(mx_float scalar, Symbol rhs) { + return Operator("_RDivScalar")(rhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _PowerScalar(Symbol lhs, mx_float scalar) { + return Operator("_PowerScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _RPowerScalar(mx_float scalar, Symbol rhs) { + return Operator("_RPowerScalar")(rhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _MaximumScalar(Symbol lhs, mx_float scalar) { + return Operator("_MaximumScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +inline Symbol _MinimumScalar(Symbol lhs, mx_float scalar) { + return Operator("_MinimumScalar")(lhs) + .SetParam("scalar", scalar) + .CreateSymbol(); +} +// TODO(zhangcheng-qinyinghua) +// make crop function run in op.h +// This function is due to [zhubuntu](https://github.com/zhubuntu) +inline Symbol Crop(const std::string& symbol_name, + int num_args, + Symbol data, + Symbol crop_like, + Shape offset = Shape(0, 0), + Shape h_w = Shape(0, 0), + bool center_crop = false) { + return Operator("Crop") + .SetParam("num_args", num_args) + .SetParam("offset", offset) + .SetParam("h_w", h_w) + .SetParam("center_crop", center_crop) + .SetInput("arg0", data) + .SetInput("arg1", crop_like) + .CreateSymbol(symbol_name); +} + + +/*! + * \breif Slice input equally along specified axis. + * \param data input symbol. + * \param num_outputs Number of outputs to be sliced. + * \param axis Dimension along which to slice. 
+ * \param squeeze_axis If true AND the sliced dimension becomes 1, squeeze that dimension.
+ * \return new symbol
+ */
+inline Symbol SliceChannel(Symbol data,
+                           int num_outputs,
+                           int axis = 1,
+                           bool squeeze_axis = false) {
+  return Operator("SliceChannel")
+      .SetParam("num_outputs", num_outputs)
+      .SetParam("axis", axis)
+      .SetParam("squeeze_axis", squeeze_axis) (data)
+      .CreateSymbol();
+}
+
+
+/*!
+ * \brief Slice input equally along specified axis.
+ * \param symbol_name name of the resulting symbol.
+ * \param data input symbol.
+ * \param num_outputs Number of outputs to be sliced.
+ * \param axis Dimension along which to slice.
+ * \param squeeze_axis If true AND the sliced dimension becomes 1, squeeze that dimension.
+ * \return new symbol
+ */
+inline Symbol SliceChannel(const std::string& symbol_name,
+                           Symbol data,
+                           int num_outputs,
+                           int axis = 1,
+                           bool squeeze_axis = false) {
+  return Operator("SliceChannel")
+      .SetParam("num_outputs", num_outputs)
+      .SetParam("axis", axis)
+      .SetParam("squeeze_axis", squeeze_axis) (data)
+      .CreateSymbol(symbol_name);
+}
+
+/*!
+ * \brief Apply activation function to input.
+ * Softmax Activation is only available with CUDNN on GPU and will be
+ * computed at each location across channels if the input is 4D.
+ * \param symbol_name name of the resulting symbol.
+ * \param data Input data to activation function.
+ * \param act_type Activation function to be applied.
+ * \return new symbol
+ */
+inline Symbol Activation(const std::string& symbol_name,
+                         Symbol data,
+                         const std::string& act_type) {
+  assert(act_type == "relu" ||
+         act_type == "sigmoid" ||
+         act_type == "softrelu" ||
+         act_type == "tanh");
+  return Operator("Activation")
+      .SetParam("act_type", act_type.c_str())
+      .SetInput("data", data)
+      .CreateSymbol(symbol_name);
+}
+
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_SUPPL_H_
+
diff --git a/cpp-package/include/mxnet-cpp/op_util.h b/cpp-package/include/mxnet-cpp/op_util.h
new file mode 100644
index 000000000000..bf67eab4c1ae
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/op_util.h
@@ -0,0 +1,46 @@
+/*!
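With these helpers, symbolic expressions compose naturally; Symbol's own arithmetic operators (defined later in symbol.hpp) simply forward to them. A short sketch (names are illustrative):

using namespace mxnet::cpp;

Symbol x = Symbol::Variable("x");
Symbol y = Symbol::Variable("y");
Symbol z = _Plus(_MulScalar(x, 2.0f), y);    // what x * 2.0f + y expands to
Symbol act = Activation("act1", z, "relu");  // asserts act_type is one of the four above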
+* Copyright (c) 2017 by Contributors +* \file op_util.h +* \brief operator helper functions +* \author Chris Olivier +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_UTIL_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_UTIL_H_ + +#include + +#if defined(MXNET_USE_CAFFE) && MXNET_USE_CAFFE != 0 +#include +#include +#endif + +namespace mxnet { +namespace cpp { + +#if defined(MXNET_USE_CAFFE) && MXNET_USE_CAFFE != 0 + +inline ::caffe::LayerParameter textToCaffeLayerParameter(const std::string& text) { + caffe::NetParameter np; + const bool success = google::protobuf::TextFormat::ParseFromString(text, &np); + CHECK_EQ(success, true) << "Invalid protpbuf layer string: " << text; + return ::caffe::LayerParameter(np.layer(0)); +} + +template +inline StreamType& operator << (StreamType& os, const ::caffe::LayerParameter& op) { + std::string s; + caffe::NetParameter np; + // Avoid wasting time making a copy -- just push in out default object's pointer + np.mutable_layer()->AddAllocated(const_cast<::caffe::LayerParameter *>(&op)); + google::protobuf::TextFormat::PrintToString(np, &s); + np.mutable_layer()->ReleaseLast(); + os << s; + return os; +} +#endif + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_UTIL_H_ diff --git a/cpp-package/include/mxnet-cpp/operator.h b/cpp-package/include/mxnet-cpp/operator.h new file mode 100644 index 000000000000..6677f86aeb75 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/operator.h @@ -0,0 +1,188 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file operator.h +* \brief definition of operator +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_H_ + +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/op_map.h" +#include "mxnet-cpp/symbol.h" + +namespace mxnet { +namespace cpp { +class Mxnet; +/*! +* \brief Operator interface +*/ +class Operator { + public: + /*! + * \brief Operator constructor + * \param operator_name type of the operator + */ + explicit Operator(const std::string &operator_name); + Operator &operator=(const Operator &rhs); + /*! + * \brief set config parameters + * \param name name of the config parameter + * \param value value of the config parameter + * \return reference of self + */ + template + Operator &SetParam(const std::string &name, const T &value) { + std::string value_str; + std::stringstream ss; + ss << value; + ss >> value_str; + + params_[name] = value_str; + return *this; + } + /*! + * \brief set config parameters from positional inputs + * \param pos the position of parameter + * \param value value of the config parameter + * \return reference of self + */ + template + Operator &SetParam(int pos, const T &value) { + std::string value_str; + std::stringstream ss; + ss << value; + ss >> value_str; + + params_[arg_names_[pos]] = value_str; + return *this; + } + /*! + * \brief add an input symbol + * \param name name of the input symbol + * \param symbol the input symbol + * \return reference of self + */ + Operator &SetInput(const std::string &name, Symbol symbol); + /*! + * \brief add an input symbol + * \param symbol the input symbol + */ + template + void PushInput(const Symbol &symbol) { + input_symbols.push_back(symbol.GetHandle()); + } + /*! + * \brief add input symbols + * \return reference of self + */ + Operator &operator()() { return *this; } + /*! 
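When the Caffe plugin is enabled, textToCaffeLayerParameter above gives tests a compact way to build layer definitions from prototxt text, and the streaming operator prints them back. A sketch, assuming a build with MXNET_USE_CAFFE defined and a valid single-layer NetParameter string:

#include <iostream>

#if defined(MXNET_USE_CAFFE) && MXNET_USE_CAFFE != 0
::caffe::LayerParameter layer =
    mxnet::cpp::textToCaffeLayerParameter("layer { name: 'data' type: 'Input' }");
std::cout << layer;  // round-trips through the operator<< defined above
#endif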
+ * \brief add input symbols + * \param symbol the input symbol + * \return reference of self + */ + Operator &operator()(const Symbol &symbol) { + input_symbols.push_back(symbol.GetHandle()); + return *this; + } + /*! + * \brief add a list of input symbols + * \param symbols the vector of the input symbols + * \return reference of self + */ + Operator &operator()(const std::vector &symbols) { + for (auto &s : symbols) { + input_symbols.push_back(s.GetHandle()); + } + return *this; + } + /*! + * \brief create a Symbol from the current operator + * \param name the name of the operator + * \return the operator Symbol + */ + Symbol CreateSymbol(const std::string &name = ""); + + /*! + * \brief add an input ndarray + * \param name name of the input ndarray + * \param ndarray the input ndarray + * \return reference of self + */ + Operator &SetInput(const std::string &name, NDArray ndarray); + /*! + * \brief add an input ndarray + * \param ndarray the input ndarray + */ + template + void PushInput(const NDArray &ndarray) { + input_ndarrays.push_back(ndarray.GetHandle()); + } + /*! + * \brief add positional inputs + */ + template + void PushInput(const T &t, Args... args) { + SetParam(N, t); + PushInput(args...); + } + /*! + * \brief add the last positional input + */ + template + void PushInput(const T &t) { + SetParam(N, t); + } + /*! + * \brief add input ndarrays + * \param ndarray the input ndarray + * \return reference of self + */ + Operator &operator()(const NDArray &ndarray) { + input_ndarrays.push_back(ndarray.GetHandle()); + return *this; + } + /*! + * \brief add a list of input ndarrays + * \param ndarrays the vector of the input ndarrays + * \return reference of self + */ + Operator &operator()(const std::vector &ndarrays) { + for (auto &s : ndarrays) { + input_ndarrays.push_back(s.GetHandle()); + } + return *this; + } + /*! + * \brief add input ndarrays + * \return reference of self + */ + template + Operator &operator()(Args... args) { + PushInput(args...); + return *this; + } + std::vector Invoke(); + void Invoke(NDArray &output); + void Invoke(std::vector &outputs); + + private: + std::map params_desc_; + bool variable_params_ = false; + std::map params_; + std::vector input_symbols; + std::vector input_ndarrays; + std::vector input_keys; + std::vector arg_names_; + AtomicSymbolCreator handle_; + static OpMap*& op_map(); +}; +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_H_ diff --git a/cpp-package/include/mxnet-cpp/operator.hpp b/cpp-package/include/mxnet-cpp/operator.hpp new file mode 100644 index 000000000000..b979b7c56d73 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/operator.hpp @@ -0,0 +1,158 @@ +/*! 
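Putting the interface together, a network layer is assembled by chaining SetParam and SetInput before CreateSymbol; a sketch using the standard FullyConnected operator (names and sizes are illustrative):

using namespace mxnet::cpp;

Symbol data = Symbol::Variable("data");
Symbol w = Symbol::Variable("fc1_w");
Symbol b = Symbol::Variable("fc1_b");
Symbol fc1 = Operator("FullyConnected")
                 .SetParam("num_hidden", 128)
                 .SetInput("data", data)
                 .SetInput("weight", w)
                 .SetInput("bias", b)
                 .CreateSymbol("fc1");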
+* Copyright (c) 2016 by Contributors +* \file operator.hpp +* \brief implementation of operator +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_HPP_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_HPP_ + +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "mxnet-cpp/op_map.h" +#include "mxnet-cpp/operator.h" + +namespace mxnet { +namespace cpp { + +/* + * Pushing NDArray or Symbol as inputs here to avoid partial specialization + * like PushInput, which is not allowed in C++ + */ +template <> +inline Operator& Operator::SetParam(int pos, const NDArray &value) { + input_ndarrays.push_back(value.GetHandle()); + return *this; +} +template <> +inline Operator& Operator::SetParam(int pos, const Symbol &value) { + input_symbols.push_back(value.GetHandle()); + return *this; +} + +inline OpMap*& Operator::op_map() { + static OpMap *op_map_ = new OpMap(); + return op_map_; +} + +inline Operator::Operator(const std::string &operator_name) { + handle_ = op_map()->GetSymbolCreator(operator_name); + const char *name; + const char *description; + mx_uint num_args; + const char **arg_names; + const char **arg_type_infos; + const char **arg_descriptions; + const char *key_var_num_args; + MXSymbolGetAtomicSymbolInfo(handle_, + &name, + &description, + &num_args, + &arg_names, + &arg_type_infos, + &arg_descriptions, + &key_var_num_args); + for (mx_uint i = 0; i < num_args; ++i) { + arg_names_.push_back(arg_names[i]); + } +} + +inline Symbol Operator::CreateSymbol(const std::string &name) { + if (input_keys.size() > 0) { + CHECK_EQ(input_keys.size(), input_symbols.size()); + } + const char *pname = name == "" ? nullptr : name.c_str(); + + SymbolHandle symbol_handle; + std::vector input_keys; + std::vector param_keys; + std::vector param_values; + + for (auto &data : params_) { + param_keys.push_back(data.first.c_str()); + param_values.push_back(data.second.c_str()); + } + for (auto &data : this->input_keys) { + input_keys.push_back(data.c_str()); + } + const char **input_keys_p = + (input_keys.size() > 0) ? 
input_keys.data() : nullptr; + + MXSymbolCreateAtomicSymbol(handle_, param_keys.size(), param_keys.data(), + param_values.data(), &symbol_handle); + MXSymbolCompose(symbol_handle, pname, input_symbols.size(), input_keys_p, + input_symbols.data()); + return Symbol(symbol_handle); +} + +inline void Operator::Invoke(std::vector &outputs) { + if (input_keys.size() > 0) { + CHECK_EQ(input_keys.size(), input_ndarrays.size()); + } + + std::vector input_keys; + std::vector param_keys; + std::vector param_values; + + for (auto &data : params_) { + param_keys.push_back(data.first.c_str()); + param_values.push_back(data.second.c_str()); + } + + int num_inputs = input_ndarrays.size(); + int num_outputs = outputs.size(); + std::vector output_handles; + std::transform(outputs.begin(), outputs.end(), + std::back_inserter(output_handles), [](NDArray& a) { + return a.GetHandle(); + }); + + NDArrayHandle *outputs_receiver = nullptr; + if (num_outputs > 0) { + outputs_receiver = output_handles.data(); + } + + MXImperativeInvoke(handle_, num_inputs, input_ndarrays.data(), + &num_outputs, &outputs_receiver, + param_keys.size(), param_keys.data(), param_values.data()); + + if (outputs.size() > 0) + return; + + std::transform(outputs_receiver, outputs_receiver+num_outputs, + std::back_inserter(outputs), [](const NDArrayHandle& handle) { + return NDArray(handle); + }); +} + +inline std::vector Operator::Invoke() { + std::vector outputs; + Invoke(outputs); + return outputs; +} + +inline void Operator::Invoke(NDArray &output) { + std::vector outputs{output}; + Invoke(outputs); +} + +inline Operator &Operator::SetInput(const std::string &name, Symbol symbol) { + input_keys.push_back(name.c_str()); + input_symbols.push_back(symbol.GetHandle()); + return *this; +} + +inline Operator &Operator::SetInput(const std::string &name, NDArray ndarray) { + input_keys.push_back(name.c_str()); + input_ndarrays.push_back(ndarray.GetHandle()); + return *this; +} + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OPERATOR_HPP_ diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h new file mode 100644 index 000000000000..80481fd282b4 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/optimizer.h @@ -0,0 +1,122 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file optimizer.h +* \brief definition of optimizer +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_H_ + +#include +#include +#include +#include +#include +#include "mxnet-cpp/base.h" +#include "dmlc/logging.h" +#include "mxnet-cpp/ndarray.h" +#include "mxnet-cpp/op_map.h" + +namespace mxnet { +namespace cpp { + +/*! +* \brief Optimizer interface +*/ +class Optimizer { + public: + /*! + * \brief get optimizer type + * \return string of optimizer type + */ + virtual std::string GetType() const = 0; + /*! + * \brief destructor + */ + virtual ~Optimizer(); + /*! + * \brief set config parameters + * \param name name of the config parameter + * \param value value of the config parameter + * \return reference of self + */ + template + Optimizer *SetParam(const std::string &name, const T &value) { + std::string value_str; + std::stringstream ss; + ss << value; + ss >> value_str; + + params_[name] = value_str; + return this; + } + /*! + * \brief Update a weight with gradient. + * \param index the unique index for the weight. + * \param weight the weight to update. + * \param grad gradient for the weight. 
+ * \param lr learning rate. + * \param wd weight decay. + */ + void Update(int index, NDArray weight, NDArray grad, mx_float lr, + mx_float wd); + /*! + * \brief Update a weight with gradient. + * \param index the unique index for the weight. + * \param weight the weight to update. + * \param grad gradient for the weight. + */ + virtual void Update(int index, NDArray weight, NDArray grad) = 0; + // TODO(zhangcheng-qinyinghua) + // implement Update a list of arrays, maybe in the form of map + // void Update(int index, std::vector weights, std::vector + // grad, mx_float lr); + + /*! + * \brief Serialize the optimizer parameters to a string. + * \return serialization + */ + std::string Serialize() const; + + protected: + std::map params_; + static OpMap*& op_map(); + const std::vector GetParamKeys_() const; + const std::vector GetParamValues_() const; +}; + +typedef std::function OptimizerCreator; + +class OptimizerRegistry { + public: + static Optimizer* Find(const std::string& name); + static int __REGISTER__(const std::string& name, OptimizerCreator creator); + private: + static std::map& cmap(); + OptimizerRegistry() = delete; + ~OptimizerRegistry() = delete; +}; + +#define MXNETCPP_REGISTER_OPTIMIZER(Name, OptimizerType) \ + static int __make_ ## OptimizerType ## _ ## Name ## __ = \ + OptimizerRegistry::__REGISTER__(#Name, [](){return new OptimizerType();}) + +class SGDOptimizer : public Optimizer { + public: + SGDOptimizer(); + virtual std::string GetType() const; + virtual void Update(int index, NDArray weight, NDArray grad); + private: + virtual ~SGDOptimizer(); + virtual void CreateState_(int index, NDArray weight); + std::map states_; + AtomicSymbolCreator update_handle_; + AtomicSymbolCreator mom_update_handle_; +}; + + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_H_ diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp new file mode 100644 index 000000000000..911989de1e95 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -0,0 +1,139 @@ +/*! 
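A typical training step pairs the registry with Update; a minimal sketch, assuming weights and grads are parallel std::vector<NDArray> holding parameters and their gradients:

using namespace mxnet::cpp;

Optimizer* opt = OptimizerRegistry::Find("sgd");
opt->SetParam("momentum", 0.9);
for (size_t i = 0; i < weights.size(); ++i) {
  // lr/wd are folded into params_ and forwarded to sgd_update / sgd_mom_update
  opt->Update(i, weights[i], grads[i], 0.01f, 1e-4f);
}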
+* Copyright (c) 2016 by Contributors +* \file optimizer.hpp +* \brief implementation of optimizer +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_HPP_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_HPP_ + +#include +#include +#include +#include +#include +#include +#include "mxnet-cpp/optimizer.h" +#include "mxnet-cpp/op.h" +#include "mxnet-cpp/op_map.h" + +namespace mxnet { +namespace cpp { + +inline std::map& OptimizerRegistry::cmap() { + static std::map cmap_; + return cmap_; +} + +inline OpMap*& Optimizer::op_map() { + static OpMap *op_map_ = new OpMap(); + return op_map_; +} + +inline Optimizer::~Optimizer() {} + +inline void Optimizer::Update(int index, NDArray weight, NDArray grad, mx_float lr, + mx_float wd) { + params_["lr"] = std::to_string(lr); + params_["wd"] = std::to_string(wd); + Update(index, weight, grad); +} + +inline std::string Optimizer::Serialize() const { + using ValueType = std::map::value_type; + auto params = params_; + params.emplace("opt_type", GetType()); + return std::accumulate(params.cbegin(), params.cend(), std::string(""), + [](const std::string& sum, const ValueType& i) { + return sum + '\n' + i.first + '=' + i.second; + }).substr(1); +} + +inline const std::vector Optimizer::GetParamKeys_() const { + std::vector keys; + for (auto& iter : params_) + keys.push_back(iter.first.c_str()); + return keys; +} + +inline const std::vector Optimizer::GetParamValues_() const { + std::vector values; + for (auto& iter : params_) + values.push_back(iter.second.c_str()); + return values; +} + +inline Optimizer* OptimizerRegistry::Find(const std::string& name) { + MXNETCPP_REGISTER_OPTIMIZER(sgd, SGDOptimizer); + MXNETCPP_REGISTER_OPTIMIZER(ccsgd, SGDOptimizer); // For backward compatibility + auto it = cmap().find(name); + if (it == cmap().end()) + return nullptr; + return it->second(); +} + +inline int OptimizerRegistry::__REGISTER__(const std::string& name, OptimizerCreator creator) { + CHECK_EQ(cmap().count(name), 0) << name << " already registered"; + cmap().emplace(name, std::move(creator)); + return 0; +} + +inline std::string SGDOptimizer::GetType() const { + return "sgd"; +} + +inline SGDOptimizer::SGDOptimizer() { + update_handle_ = op_map()->GetSymbolCreator("sgd_update"); + mom_update_handle_ = op_map()->GetSymbolCreator("sgd_mom_update"); +} + +inline SGDOptimizer::~SGDOptimizer() { + for (auto &it : states_) { + delete it.second; + } +} + +inline void SGDOptimizer::Update(int index, NDArray weight, NDArray grad) { + if (states_.count(index) == 0) { + CreateState_(index, weight); + } + + auto keys = GetParamKeys_(); + auto values = GetParamValues_(); + CHECK_EQ(keys.size(), values.size()); + + NDArrayHandle inputs[3]; + inputs[0] = weight.GetHandle(); + inputs[1] = grad.GetHandle(); + + int num_outputs = 1; + NDArrayHandle output = weight.GetHandle(); + NDArrayHandle *outputs = &output; + + if (states_[index] == nullptr) { + MXImperativeInvoke(update_handle_, 2, inputs, + &num_outputs, &outputs, + keys.size(), keys.data(), values.data()); + } else { + inputs[2] = states_[index]->GetHandle(); + MXImperativeInvoke(mom_update_handle_, 3, inputs, + &num_outputs, &outputs, + keys.size(), keys.data(), values.data()); + } +} + +inline void SGDOptimizer::CreateState_(int index, NDArray weight) { + if (params_.count("momentum") == 0) { + states_[index] = nullptr; + } else { + states_[index] = new NDArray(weight.GetShape(), weight.GetContext()); + *states_[index] = 0; + } +} + + +} // namespace cpp +} // namespace mxnet + 
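The registration macro above is also the extension point for user-defined optimizers; a sketch of a hypothetical minimal subclass (PlainSGD is not part of the package):

using namespace mxnet::cpp;

class PlainSGD : public Optimizer {
 public:
  std::string GetType() const override { return "plain_sgd"; }
  void Update(int index, NDArray weight, NDArray grad) override {
    // w <- w - lr * grad, with "lr" taken from the inherited params_ map
    float lr = std::stof(params_.at("lr"));
    weight -= grad * lr;
  }
};
MXNETCPP_REGISTER_OPTIMIZER(plain_sgd, PlainSGD);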
+#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OPTIMIZER_HPP_ diff --git a/cpp-package/include/mxnet-cpp/shape.h b/cpp-package/include/mxnet-cpp/shape.h new file mode 100644 index 000000000000..d8e3f2c95282 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/shape.h @@ -0,0 +1,389 @@ +/*! +* Copyright (c) 2016 by Contributors +* \file shape.h +* \brief definition of shape +* \author Chuntao Hong, Zhang Chen +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_SHAPE_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_SHAPE_H_ + +#include +#include +#include +#include +#include "mxnet-cpp/base.h" + +namespace mxnet { +namespace cpp { + +/*! +* \brief dynamic shape class that can hold shape +* of arbirary dimension +*/ +struct Shape { + public: + /*! \brief constructor */ + Shape() + : ndim_(0), + num_heap_allocated_(0), + data_heap_(NULL) {} + /*! + * \brief constructor from a vector of index_t + * \param v the vector + */ + explicit Shape(const std::vector &v) + : ndim_(v.size()) { + if (ndim_ <= kStackCache) { + data_heap_ = NULL; + num_heap_allocated_ = 0; + std::copy(v.begin(), v.end(), data_stack_); + } else { + data_heap_ = new index_t[ndim_]; + num_heap_allocated_ = ndim_; + std::copy(v.begin(), v.end(), data_heap_); + } + } + /*! + * \brief constructor one dimmension shape + * \param s1 size of the first dimmension + */ + explicit Shape(index_t s1) + : ndim_(1) { + if (ndim_ <= kStackCache) { + data_heap_ = NULL; + num_heap_allocated_ = 0; + data_stack_[0] = s1; + } else { + data_heap_ = new index_t[ndim_]; + num_heap_allocated_ = ndim_; + data_heap_[0] = s1; + } + } + /*! + * \brief constructor two dimmension shape + * \param s1 size of the first dimmension + * \param s2 size of the second dimmension + */ + Shape(index_t s1, index_t s2) + : ndim_(2) { + if (ndim_ <= kStackCache) { + data_heap_ = NULL; + num_heap_allocated_ = 0; + data_stack_[0] = s1; + data_stack_[1] = s2; + } else { + data_heap_ = new index_t[ndim_]; + num_heap_allocated_ = ndim_; + data_heap_[0] = s1; + data_heap_[1] = s2; + } + } + /*! + * \brief constructor three dimmension shape + * \param s1 size of the first dimmension + * \param s2 size of the second dimmension + * \param s3 size of the third dimmension + */ + Shape(index_t s1, index_t s2, index_t s3) + : ndim_(3) { + if (ndim_ <= kStackCache) { + data_heap_ = NULL; + num_heap_allocated_ = 0; + data_stack_[0] = s1; + data_stack_[1] = s2; + data_stack_[2] = s3; + } else { + data_heap_ = new index_t[ndim_]; + num_heap_allocated_ = ndim_; + data_heap_[0] = s1; + data_heap_[1] = s2; + data_heap_[2] = s3; + } + } + /*! + * \brief constructor four dimmension shape + * \param s1 size of the first dimmension + * \param s2 size of the second dimmension + * \param s3 size of the third dimmension + * \param s4 size of the fourth dimmension + */ + Shape(index_t s1, index_t s2, index_t s3, index_t s4) + : ndim_(4) { + if (ndim_ <= kStackCache) { + data_heap_ = NULL; + num_heap_allocated_ = 0; + data_stack_[0] = s1; + data_stack_[1] = s2; + data_stack_[2] = s3; + data_stack_[3] = s4; + } else { + data_heap_ = new index_t[ndim_]; + num_heap_allocated_ = ndim_; + data_heap_[0] = s1; + data_heap_[1] = s2; + data_heap_[2] = s3; + data_heap_[3] = s4; + } + } + /*! 
+ * \brief constructor five dimension shape
+ * \param s1 size of the first dimension
+ * \param s2 size of the second dimension
+ * \param s3 size of the third dimension
+ * \param s4 size of the fourth dimension
+ * \param s5 size of the fifth dimension
+ */
+  Shape(index_t s1, index_t s2, index_t s3, index_t s4, index_t s5)
+    : ndim_(5) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      data_stack_[0] = s1;
+      data_stack_[1] = s2;
+      data_stack_[2] = s3;
+      data_stack_[3] = s4;
+      data_stack_[4] = s5;
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      data_heap_[0] = s1;
+      data_heap_[1] = s2;
+      data_heap_[2] = s3;
+      data_heap_[3] = s4;
+      data_heap_[4] = s5;  // was data_heap_[5], an out-of-bounds write
+    }
+  }
+  /*!
+   * \brief constructor from Shape
+   * \param s the source shape
+   */
+  Shape(const Shape &s)
+    : ndim_(s.ndim_) {
+    if (ndim_ <= kStackCache) {
+      data_heap_ = NULL;
+      num_heap_allocated_ = 0;
+      std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
+    } else {
+      data_heap_ = new index_t[ndim_];
+      num_heap_allocated_ = ndim_;
+      std::copy(s.data_heap_, s.data_heap_ + ndim_, data_heap_);
+    }
+  }
+#if MSHADOW_IN_CXX11
+  /*!
+   * \brief move constructor from Shape
+   * \param s the source shape
+   */
+  Shape(Shape &&s)
+    : ndim_(s.ndim_),
+      num_heap_allocated_(s.num_heap_allocated_),
+      data_heap_(s.data_heap_) {
+    if (ndim_ <= kStackCache) {
+      std::copy(s.data_stack_, s.data_stack_ + ndim_, data_stack_);
+    }
+    // remove data heap space from s
+    s.data_heap_ = NULL;
+  }
+#endif
+  /*! \brief destructor */
+  ~Shape() {
+    // data_heap_ can be NULL
+    delete[] data_heap_;
+  }
+  /*!
+   * \brief copy shape from content between two iterators
+   * \param begin the beginning of iterator
+   * \param end the end of the iterator
+   * \tparam RandomAccessIterator iterator type
+   */
+  template<typename RandomAccessIterator>
+  inline void CopyFrom(RandomAccessIterator begin,
+                       RandomAccessIterator end) {
+    this->SetDim(end - begin);
+    std::copy(begin, end, data());
+  }
+  /*!
+   * \brief assignment from shape
+   * \param shape source shape
+   * \return reference of self
+   */
+  inline Shape &operator=(const Shape &shape) {
+    this->SetDim(shape.ndim_);
+    const index_t *src = shape.data();
+    std::copy(src, src + ndim_, data());
+    return *this;
+  }
+  /*!
+   * \brief assignment from vector
+   * \param shape source shape
+   * \return reference of self
+   */
+  inline Shape &operator=(const std::vector<index_t> &shape) {
+    this->CopyFrom(shape.begin(), shape.end());
+    return *this;
+  }
+  /*! \return the data content of the shape */
+  inline const index_t *data() const {
+    return ndim_ <= kStackCache ? data_stack_ : data_heap_;
+  }
+  /*! \return the data content of the shape */
+  inline index_t *data() {
+    return ndim_ <= kStackCache ? data_stack_ : data_heap_;
+  }
+  /*! \brief return number of dimensions of the tensor inside */
+  inline index_t ndim(void) const {
+    return ndim_;
+  }
+  /*!
+   * \brief get corresponding index
+   * \param i dimension index
+   * \return the corresponding dimension size
+   */
+  inline index_t &operator[](index_t i) {
+    return data()[i];
+  }
+  /*!
+   * \brief get corresponding index
+   * \param i dimension index
+   * \return the corresponding dimension size
+   */
+  inline const index_t &operator[](index_t i) const {
+    return data()[i];
+  }
+  /*! \brief total number of elements in the tensor */
+  inline size_t Size(void) const {
+    size_t size = 1;
+    const index_t *d = this->data();
+    for (index_t i = 0; i < ndim_; ++i) {
+      size *= d[i];
+    }
+    return size;
+  }
+  /*!
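In practice the fixed-arity constructors cover most uses, and anything up to five dimensions stays in the stack cache; a quick sketch:

using namespace mxnet::cpp;

Shape s(2, 3, 4);        // 3-D, held in data_stack_ (kStackCache == 5)
size_t n = s.Size();     // 24, the product of all dimensions
index_t mid = s[1];      // 3
Shape copy = s;          // deep copy via the copy constructor above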
+ * \return whether two shape equals + * \param s the shape to compare against + */ + inline bool operator==(const Shape &s) const { + if (ndim_ != s.ndim_) return false; + if (ndim_ <= kStackCache) { + for (index_t i = 0; i < ndim_; ++i) { + if (data_stack_[i] != s.data_stack_[i]) return false; + } + } else { + for (index_t i = 0; i < ndim_; ++i) { + if (data_heap_[i] != s.data_heap_[i]) return false; + } + } + return true; + } + /*! + * \return whether two shape not equals + * \param s the shape to compare against + */ + inline bool operator!=(const Shape &s) const { + return !(*this == s); + } + + friend std::ostream &operator<<(std::ostream &os, const Shape &shape); + friend std::istream &operator>>(std::istream &is, Shape &shape); + + private: + // the shape will be stored in data_stack_ + // when dimension is smaller than kStackCache + // when it is bigger, it will be stored in data_heap_; + /*! \brief size of in stack space */ + static const index_t kStackCache = 5; + /*! \brief number of dimnsion of the shape */ + index_t ndim_; + /*! \brief number of cells allocated in data_heap_ */ + index_t num_heap_allocated_; + /*! \brief in stack space used to store shape when it is small */ + index_t data_stack_[kStackCache]; + /*! \brief space to store shape when dimension is big*/ + index_t *data_heap_; + /*! + * \brief internal function to set the dimension + * \param dim the dimension of the shape + */ + inline void SetDim(index_t dim) { + if (dim > kStackCache && + dim > num_heap_allocated_) { + // data_heap_ can be NULL + delete[] data_heap_; + data_heap_ = new index_t[dim]; + num_heap_allocated_ = dim; + } + ndim_ = dim; + } +}; + +/*! +* \brief allow string printing of the shape +* \param os the output stream +* \param shape the shape +* \return the ostream +*/ +inline std::ostream &operator<<(std::ostream &os, const Shape &shape) { + os << '('; + for (index_t i = 0; i < shape.ndim(); ++i) { + if (i != 0) os << ','; + os << static_cast(shape[i]); // Supports negative Shape 'special codes' for inferring + } + // python style tuple + if (shape.ndim() == 1) os << ','; + os << ')'; + return os; +} + +/*! +* \brief read shape from the istream +* \param is the input stream +* \param shape the shape +* \return the istream +*/ +inline std::istream &operator>>(std::istream &is, Shape &shape) { + // get ( + while (true) { + char ch = is.get(); + if (ch == '(') break; + if (!isspace(ch)) { + is.setstate(std::ios::failbit); + return is; + } + } + index_t idx; + std::vector tmp; + while (is >> idx) { + tmp.push_back(idx); + char ch; + do { + ch = is.get(); + } while (isspace(ch)); + if (ch == ',') { + while (true) { + ch = is.peek(); + if (isspace(ch)) { + is.get(); continue; + } + if (ch == ')') { + is.get(); break; + } + break; + } + if (ch == ')') break; + } else if (ch == ')') { + break; + } else { + is.setstate(std::ios::failbit); + return is; + } + } + shape.CopyFrom(tmp.begin(), tmp.end()); + return is; +} + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_SHAPE_H_ diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h new file mode 100644 index 000000000000..03a8409f8087 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/symbol.h @@ -0,0 +1,257 @@ +/*! 
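The stream operators give Shape a Python-tuple text form that round-trips; a sketch:

#include <sstream>
using namespace mxnet::cpp;

std::stringstream ss;
ss << Shape(2, 3);   // writes "(2,3)"; a 1-D shape prints as "(2,)"
Shape parsed;
ss >> parsed;        // parses it back; parsed == Shape(2, 3)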
+* Copyright (c) 2016 by Contributors
+* \file symbol.h
+* \brief definition of symbol
+* \author Chuntao Hong, Zhang Chen
+*/
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_H_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_H_
+
+#include <map>
+#include <string>
+#include <vector>
+#include "mxnet-cpp/base.h"
+#include "mxnet-cpp/ndarray.h"
+#include "mxnet-cpp/op_map.h"
+
+namespace mxnet {
+namespace cpp {
+
+class Executor;
+
+/*!
+* \brief struct to store SymbolHandle
+*/
+struct SymBlob {
+ public:
+  /*!
+   * \brief default constructor
+   */
+  SymBlob() : handle_(nullptr) {}
+  /*!
+   * \brief construct with SymbolHandle to store
+   */
+  explicit SymBlob(SymbolHandle handle) : handle_(handle) {}
+  /*!
+   * \brief destructor, free the SymbolHandle
+   */
+  ~SymBlob() { MXSymbolFree(handle_); }
+  /*!
+   * \brief the SymbolHandle to store
+   */
+  SymbolHandle handle_;
+
+ private:
+  SymBlob(const SymBlob &);
+  SymBlob &operator=(const SymBlob &);
+};
+
+/*!
+* \brief Symbol interface
+*/
+class Symbol {
+ public:
+  Symbol() {}
+  /*!
+   * \brief construct a Symbol with SymbolHandle
+   * \param handle the given SymbolHandle
+   */
+  explicit Symbol(SymbolHandle handle);
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  explicit Symbol(const char *name);
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  explicit Symbol(const std::string &name);
+  Symbol operator+(const Symbol &rhs) const;
+  Symbol operator-(const Symbol &rhs) const;
+  Symbol operator*(const Symbol &rhs) const;
+  Symbol operator/(const Symbol &rhs) const;
+
+  Symbol operator+(mx_float scalar) const;
+  Symbol operator-(mx_float scalar) const;
+  Symbol operator*(mx_float scalar) const;
+  Symbol operator/(mx_float scalar) const;
+  Symbol Copy() const;
+  /*!
+   * \brief construct a variable Symbol
+   * \param name the name of the variable
+   */
+  static Symbol Variable(const std::string &name = "");
+  Symbol operator[](int index);
+  Symbol operator[](const std::string &index);
+  /*!
+   * \brief Create a symbol that groups symbols together
+   * \param symbols List of symbols to be grouped
+   */
+  static Symbol Group(const std::vector<Symbol> &symbols);
+  /*!
+   * \brief load Symbol from a JSON file
+   * \param file_name the name of the file
+   */
+  static Symbol Load(const std::string &file_name);
+  /*!
+   * \brief load Symbol from a JSON string
+   * \param json_str the JSON string
+   */
+  static Symbol LoadJSON(const std::string &json_str);
+  /*!
+   * \brief save Symbol to a file
+   * \param file_name the name of the file
+   */
+  void Save(const std::string &file_name) const;
+  /*!
+   * \brief save Symbol into a JSON string
+   */
+  std::string ToJSON() const;
+  /*!
+   * \brief get a symbol that contains all the internals
+   * \return the symbol whose outputs are all the internals.
+   */
+  Symbol GetInternals() const;
+  /*!
+   * \return the SymbolHandle
+   */
+  SymbolHandle GetHandle() const { return blob_ptr_->handle_; }
+  /*!
+   * \brief construct an operator Symbol, with given input Symbol and config
+   * \param name the name of the Symbol
+   * \param input_keys the vector of keys of the input
+   * \param input_values the vector of the input Symbols
+   * \param config_keys the vector of keys of the config
+   * \param config_values the vector of values of the config
+   */
+  Symbol(const std::string &operator_name, const std::string &name,
+         std::vector<const char *> input_keys,
+         std::vector<SymbolHandle> input_values,
+         std::vector<const char *> config_keys,
+         std::vector<const char *> config_values);
+  /*!
+   * \brief infer the shapes by providing shapes of known arguments.
+   * \param arg_shapes map of argument name to shape of arguments with known
+   * shapes.
+   * \param in_shape used to store inferred shapes of input arguments.
+   * \param aux_shape used to store the inferred shapes of auxiliary states.
+   * \param out_shape used to store inferred shapes of outputs.
+   */
+  void InferShape(
+      const std::map<std::string, std::vector<mx_uint> > &arg_shapes,
+      std::vector<std::vector<mx_uint> > *in_shape,
+      std::vector<std::vector<mx_uint> > *aux_shape,
+      std::vector<std::vector<mx_uint> > *out_shape) const;
+  /*!
+   * \brief List the arguments names.
+   *
+   * The position of the returned list also corresponds to calling position in
+   * operator()
+   * \return the arguments list of this symbol, they can be either named or
+   * unnamed (empty string).
+   */
+  std::vector<std::string> ListArguments() const;
+  /*! \return get the descriptions of outputs for this symbol */
+  std::vector<std::string> ListOutputs() const;
+  /*! \return get the descriptions of auxiliary data for this symbol */
+  std::vector<std::string> ListAuxiliaryStates() const;
+  /*!
+   * \brief infer and construct all the arrays to bind to executor by providing
+   * some known arrays.
+   * \param context the context of all the inferred arrays
+   * \param arg_arrays inferred input arguments arrays.
+   * \param grad_arrays inferred arrays to store the gradient output of the
+   * input arguments.
+   * \param grad_reqs inferred gradient request types for the input arguments.
+   * \param aux_arrays inferred arrays that are used as internal states in op.
+   * \param args_map map of some given arguments arrays.
+   * \param arg_grad_store map of some given gradient store arrays.
+   * \param grad_req_type map of some given types of gradient saving. Can only
+   * be in {kNullOp, kAddTo, kWriteTo}.
+   * \param aux_map NDArray that stores the internal state in op
+   */
+  void InferExecutorArrays(
+      const Context &context, std::vector<NDArray> *arg_arrays,
+      std::vector<NDArray> *grad_arrays, std::vector<OpReqType> *grad_reqs,
+      std::vector<NDArray> *aux_arrays,
+      const std::map<std::string, NDArray> &args_map,
+      const std::map<std::string, NDArray> &arg_grad_store =
+          std::map<std::string, NDArray>(),
+      const std::map<std::string, OpReqType> &grad_req_type =
+          std::map<std::string, OpReqType>(),
+      const std::map<std::string, NDArray> &aux_map =
+          std::map<std::string, NDArray>()) const;
+  /*!
+   * \brief infer and construct all the input arguments arrays to bind to
+   * executor by providing some known arguments arrays.
+   * \param context the context of all the inferred arrays.
+   * \param args_map map of all the inferred input arguments arrays.
+   * \param known_args map of some given arguments arrays.
+   */
+  void InferArgsMap(const Context &context,
+                    std::map<std::string, NDArray> *args_map,
+                    const std::map<std::string, NDArray> &known_args) const;
+  /*!
+   * \brief Create an executor by binding the symbol with context and arguments.
+   * If the user does not want to compute the gradients of the i-th argument,
+   * grad_req_type[i] can be kNullOp.
+   * The input arrays in the given maps should have the same names as the input
+   * symbols.
+   * Only some of the necessary arrays need to be given; the other arrays can be
+   * inferred automatically.
+   *
+   * \param context the context of binding.
+   * \param args_map the NDArray that stores the input arguments to the symbol.
+   * \param arg_grad_store NDArray that is used to store the gradient output of
+   * the input arguments.
+   * \param grad_req_type requirement type of gradient saving. Can only be in
+   * {kNullOp, kAddTo, kWriteTo}.
+   * \param aux_map NDArray that stores the internal state in op
+   * \return a new executor, which needs to be freed manually.
+   */
+  Executor *SimpleBind(const Context &context,
+                       const std::map<std::string, NDArray> &args_map,
+                       const std::map<std::string, NDArray> &arg_grad_store =
+                           std::map<std::string, NDArray>(),
+                       const std::map<std::string, OpReqType> &grad_req_type =
+                           std::map<std::string, OpReqType>(),
+                       const std::map<std::string, NDArray> &aux_map =
+                           std::map<std::string, NDArray>());
+  /*!
+   * \brief Create an executor by binding the symbol with context and arguments.
+  * If a user does not want to compute the gradients of the i-th argument,
+  * grad_req_type[i] can be kNullOp.
+  *
+  * \param context the context of binding.
+  * \param arg_arrays the NDArray that stores the input arguments to the symbol.
+  * \param grad_arrays NDArray that is used to store the gradient output of the
+  * input arguments.
+  * \param grad_reqs requirement type of gradient saving. Can only be in
+  * {kNullOp, kAddTo, kWriteTo}.
+  * \param aux_arrays NDArray that is used as internal state in op
+  * \param group_to_ctx dict of string to mx.Context
+  * \param shared_exec Executor to share memory with. This is intended for
+  * runtime reshaping, variable-length sequences, etc. The returned executor
+  * shares state with shared_exec, and should not be used in parallel with it.
+  * \return a new executor, which needs to be freed manually.
+  */
+  Executor *Bind(const Context &context, const std::vector<NDArray> &arg_arrays,
+                 const std::vector<NDArray> &grad_arrays,
+                 const std::vector<OpReqType> &grad_reqs,
+                 const std::vector<NDArray> &aux_arrays,
+                 const std::map<std::string, Context> &group_to_ctx =
+                     std::map<std::string, Context>(),
+                 Executor *shared_exec = nullptr);
+
+ private:
+  std::shared_ptr<SymBlob> blob_ptr_;
+  static OpMap*& op_map();
+};
+Symbol operator+(mx_float lhs, const Symbol &rhs);
+Symbol operator-(mx_float lhs, const Symbol &rhs);
+Symbol operator*(mx_float lhs, const Symbol &rhs);
+Symbol operator/(mx_float lhs, const Symbol &rhs);
+}  // namespace cpp
+}  // namespace mxnet
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_H_
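For review orientation, here is a minimal sketch of how the low-level operator constructor declared above is meant to be driven. It is not part of the patch: the operator name `_Plus`, the key names, and `MakePlus` are assumptions made for illustration, and in practice this constructor is invoked through the generated wrappers in op.h and through op_suppl.h rather than called directly.

```cpp
// Hand-written illustration, not part of the patch: builds an elementwise-add
// node the same way a generated wrapper would, via the operator constructor.
#include <string>
#include <vector>
#include "mxnet-cpp/symbol.h"

using namespace mxnet::cpp;

Symbol MakePlus(const Symbol &lhs, const Symbol &rhs) {
  // input_keys/input_values name and supply the operator inputs;
  // config_keys/config_values carry the stringly-typed parameters (none here).
  return Symbol("_Plus",                             // registered operator name (assumed)
                "plus0",                             // name of the resulting symbol
                {"lhs", "rhs"},                      // input keys
                {lhs.GetHandle(), rhs.GetHandle()},  // input handles
                {},                                  // config keys
                {});                                 // config values
}
```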
diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp
new file mode 100644
index 000000000000..a2ab9cb87f30
--- /dev/null
+++ b/cpp-package/include/mxnet-cpp/symbol.hpp
@@ -0,0 +1,342 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file symbol.hpp
+ * \brief implementation of the symbol
+ * \author Zhang Chen, Chuntao Hong
+ */
+
+#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_HPP_
+#define CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_HPP_
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dmlc/logging.h"
+#include "mxnet-cpp/symbol.h"
+
+#include "mxnet-cpp/op_suppl.h"
+
+namespace mxnet {
+namespace cpp {
+inline OpMap*& Symbol::op_map() {
+  static OpMap* op_map_ = new OpMap();
+  return op_map_;
+}
+inline Symbol::Symbol(SymbolHandle handle) {
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+inline Symbol::Symbol(const char *name) {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCreateVariable(name, &(handle)), 0);
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+inline Symbol::Symbol(const std::string &name) : Symbol(name.c_str()) {}
+inline Symbol Symbol::Variable(const std::string &name) { return Symbol(name); }
+inline Symbol Symbol::operator+(const Symbol &rhs) const { return _Plus(*this, rhs); }
+inline Symbol Symbol::operator-(const Symbol &rhs) const { return _Minus(*this, rhs); }
+inline Symbol Symbol::operator*(const Symbol &rhs) const { return _Mul(*this, rhs); }
+inline Symbol Symbol::operator/(const Symbol &rhs) const { return _Div(*this, rhs); }
+inline Symbol Symbol::operator+(mx_float scalar) const {
+  return _PlusScalar(*this, scalar);
+}
+inline Symbol Symbol::operator-(mx_float scalar) const {
+  return _MinusScalar(*this, scalar);
+}
+inline Symbol Symbol::operator*(mx_float scalar) const {
+  return _MulScalar(*this, scalar);
+}
+inline Symbol Symbol::operator/(mx_float scalar) const {
+  return _DivScalar(*this, scalar);
+}
+inline Symbol Symbol::operator[](int index) {
+  SymbolHandle out;
+  MXSymbolGetOutput(GetHandle(), index, &out);
+  return Symbol(out);
+}
+inline Symbol Symbol::operator[](const std::string &index) {
+  auto outputs = ListOutputs();
+  for (mx_uint i = 0; i < outputs.size(); ++i) {
+    if (outputs[i] == index) {
+      return (*this)[i];
+    }
+  }
+  LOG(FATAL) << "Cannot find output that matches name " << index;
+  return (*this)[0];
+}
+inline Symbol Symbol::Group(const std::vector<Symbol> &symbols) {
+  SymbolHandle out;
+  std::vector<SymbolHandle> handle_list;
+  for (const auto &t : symbols) {
+    handle_list.push_back(t.GetHandle());
+  }
+  MXSymbolCreateGroup(handle_list.size(), handle_list.data(), &out);
+  return Symbol(out);
+}
+inline Symbol Symbol::Load(const std::string &file_name) {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCreateFromFile(file_name.c_str(), &(handle)), 0);
+  return Symbol(handle);
+}
+inline Symbol Symbol::LoadJSON(const std::string &json_str) {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCreateFromJSON(json_str.c_str(), &(handle)), 0);
+  return Symbol(handle);
+}
+inline void Symbol::Save(const std::string &file_name) const {
+  CHECK_EQ(MXSymbolSaveToFile(GetHandle(), file_name.c_str()), 0);
+}
+inline std::string Symbol::ToJSON() const {
+  const char *out_json;
+  CHECK_EQ(MXSymbolSaveToJSON(GetHandle(), &out_json), 0);
+  return std::string(out_json);
+}
+inline Symbol Symbol::GetInternals() const {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolGetInternals(GetHandle(), &handle), 0);
+  return Symbol(handle);
+}
+inline Symbol::Symbol(const std::string &operator_name, const std::string &name,
+                      std::vector<const char *> input_keys,
+                      std::vector<SymbolHandle> input_values,
+                      std::vector<const char *> config_keys,
+                      std::vector<const char *> config_values) {
+  SymbolHandle handle;
+  AtomicSymbolCreator creator = op_map()->GetSymbolCreator(operator_name);
+  MXSymbolCreateAtomicSymbol(creator, config_keys.size(), config_keys.data(),
+                             config_values.data(), &handle);
+  MXSymbolCompose(handle, operator_name.c_str(), input_keys.size(),
+                  input_keys.data(), input_values.data());
+  blob_ptr_ = std::make_shared<SymBlob>(handle);
+}
+
+inline Symbol Symbol::Copy() const {
+  SymbolHandle handle;
+  CHECK_EQ(MXSymbolCopy(GetHandle(), &handle), 0);
+  return Symbol(handle);
+}
+
+inline std::vector<std::string> Symbol::ListArguments() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListArguments(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+inline std::vector<std::string> Symbol::ListOutputs() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListOutputs(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+inline std::vector<std::string> Symbol::ListAuxiliaryStates() const {
+  std::vector<std::string> ret;
+  mx_uint size;
+  const char **sarr;
+  MXSymbolListAuxiliaryStates(GetHandle(), &size, &sarr);
+  for (mx_uint i = 0; i < size; ++i) {
+    ret.push_back(std::string(sarr[i]));
+  }
+  return ret;
+}
+
+inline void Symbol::InferShape(
+    const std::map<std::string, std::vector<mx_uint> > &arg_shapes,
+    std::vector<std::vector<mx_uint> > *in_shape,
+    std::vector<std::vector<mx_uint> > *aux_shape,
+    std::vector<std::vector<mx_uint> > *out_shape) const {
+
+  std::vector<const char *> keys;
+  std::vector<mx_uint> arg_ind_ptr;
+  std::vector<mx_uint> arg_shape_data;
+
+  for (const auto &arg : arg_shapes) {
+    keys.push_back(arg.first.c_str());
+    arg_ind_ptr.push_back(arg_shape_data.size());
+    for (auto i : arg.second) {
+      arg_shape_data.push_back(i);
+    }
+  }
+  arg_ind_ptr.push_back(arg_shape_data.size());
+
+  mx_uint in_shape_size;
+  const mx_uint *in_shape_ndim;
+  const mx_uint **in_shape_data;
+  mx_uint out_shape_size;
+  const mx_uint *out_shape_ndim;
+  const mx_uint **out_shape_data;
+  mx_uint aux_shape_size;
+  const mx_uint *aux_shape_ndim;
+  const mx_uint **aux_shape_data;
+  int complete;
+
+  CHECK_EQ(MXSymbolInferShape(GetHandle(), keys.size(), keys.data(),
+                              arg_ind_ptr.data(), arg_shape_data.data(),
+                              &in_shape_size, &in_shape_ndim, &in_shape_data,
+                              &out_shape_size, &out_shape_ndim, &out_shape_data,
+                              &aux_shape_size, &aux_shape_ndim, &aux_shape_data,
+                              &complete),
+           0);
+
+  if (complete) {
+    for (mx_uint i = 0; i < in_shape_size; ++i) {
+      in_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < in_shape_ndim[i]; ++j) {
+        (*in_shape)[i].push_back(in_shape_data[i][j]);
+      }
+    }
+    for (mx_uint i = 0; i < aux_shape_size; ++i) {
+      aux_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < aux_shape_ndim[i]; ++j) {
+        (*aux_shape)[i].push_back(aux_shape_data[i][j]);
+      }
+    }
+    for (mx_uint i = 0; i < out_shape_size; ++i) {
+      out_shape->push_back(std::vector<mx_uint>());
+      for (mx_uint j = 0; j < out_shape_ndim[i]; ++j) {
+        (*out_shape)[i].push_back(out_shape_data[i][j]);
+      }
+    }
+  }
+}
+
+inline void Symbol::InferExecutorArrays(
+    const Context &context, std::vector<NDArray> *arg_arrays,
+    std::vector<NDArray> *grad_arrays, std::vector<OpReqType> *grad_reqs,
+    std::vector<NDArray> *aux_arrays,
+    const std::map<std::string, NDArray> &args_map,
+    const std::map<std::string, NDArray> &arg_grad_store,
+    const std::map<std::string, OpReqType> &grad_req_type,
+    const std::map<std::string, NDArray> &aux_map) const {
+
+  const auto arg_name_list = ListArguments();
+  std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
+  std::map<std::string, std::vector<mx_uint> > arg_shapes;
+
+  for (const auto &arg_name : arg_name_list) {
+    auto iter = args_map.find(arg_name);
+    if (iter != args_map.end()) {
+      arg_shapes[arg_name] = iter->second.GetShape();
+    }
+  }
+
+  InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);
+
+  for (size_t i = 0; i < in_shapes.size(); ++i) {
+    const auto &shape = in_shapes[i];
+    const auto &arg_name = arg_name_list[i];
+    auto iter_arg = args_map.find(arg_name);
+    if (iter_arg != args_map.end()) {
+      arg_arrays->push_back(iter_arg->second);
+    } else {
+      arg_arrays->push_back(NDArray(shape, context, false));
+      NDArray::SampleGaussian(0, 1, &arg_arrays->back());
+    }
+    auto iter_grad = arg_grad_store.find(arg_name);
+    if (iter_grad != arg_grad_store.end()) {
+      grad_arrays->push_back(iter_grad->second);
+    } else {
+      grad_arrays->push_back(NDArray(shape, context, false));
+    }
+    auto iter_req = grad_req_type.find(arg_name);
+    if (iter_req != grad_req_type.end()) {
+      grad_reqs->push_back(iter_req->second);
+    } else if (arg_name.rfind("data") == arg_name.length() - 4
+               || arg_name.rfind("label") == arg_name.length() - 5) {
+      grad_reqs->push_back(OpReqType::kNullOp);
+    } else {
+      grad_reqs->push_back(OpReqType::kWriteTo);
+    }
+  }
+
+  const auto aux_name_list = ListAuxiliaryStates();
+  for (size_t i = 0; i < aux_shapes.size(); ++i) {
+    const auto &shape = aux_shapes[i];
+    const auto &aux_name = aux_name_list[i];
+    auto iter_aux = aux_map.find(aux_name);
+    if (iter_aux != aux_map.end()) {
+      aux_arrays->push_back(iter_aux->second);
+    } else {
+      aux_arrays->push_back(NDArray(shape, context, false));
+      NDArray::SampleGaussian(0, 1, &aux_arrays->back());
+    }
+  }
+}
+inline void Symbol::InferArgsMap(
+    const Context &context, std::map<std::string, NDArray> *args_map,
+    const std::map<std::string, NDArray> &known_args) const {
+
+  const auto arg_name_list = ListArguments();
+  std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
+  std::map<std::string, std::vector<mx_uint> > arg_shapes;
+
+  for (const auto &arg_name : arg_name_list) {
+    auto iter = known_args.find(arg_name);
+    if (iter != known_args.end()) {
+      arg_shapes[arg_name] = iter->second.GetShape();
+    }
+  }
+
+  InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);
+
+  for (size_t i = 0; i < in_shapes.size(); ++i) {
+    const auto &shape = in_shapes[i];
+    const auto &arg_name = arg_name_list[i];
+    auto iter_arg = known_args.find(arg_name);
+    if (iter_arg != known_args.end()) {
+      (*args_map)[arg_name] = iter_arg->second;
+    } else {
+      (*args_map)[arg_name] = NDArray(shape, context, false);
+      NDArray::SampleGaussian(0, 1, &(*args_map)[arg_name]);
+    }
+  }
+}
+
+inline Executor *Symbol::SimpleBind(
+    const Context &context, const std::map<std::string, NDArray> &args_map,
+    const std::map<std::string, NDArray> &arg_grad_store,
+    const std::map<std::string, OpReqType> &grad_req_type,
+    const std::map<std::string, NDArray> &aux_map) {
+  std::vector<NDArray> arg_arrays;
+  std::vector<NDArray> grad_arrays;
+  std::vector<OpReqType> grad_reqs;
+  std::vector<NDArray> aux_arrays;
+
+  InferExecutorArrays(context, &arg_arrays, &grad_arrays, &grad_reqs,
+                      &aux_arrays, args_map, arg_grad_store, grad_req_type,
+                      aux_map);
+
+  return new Executor(*this, context, arg_arrays, grad_arrays, grad_reqs,
+                      aux_arrays);
+}
+
+inline Executor *Symbol::Bind(const Context &context,
+                              const std::vector<NDArray> &arg_arrays,
+                              const std::vector<NDArray> &grad_arrays,
+                              const std::vector<OpReqType> &grad_reqs,
+                              const std::vector<NDArray> &aux_arrays,
+                              const std::map<std::string, Context> &group_to_ctx,
+                              Executor *shared_exec) {
+  return new Executor(*this, context, arg_arrays, grad_arrays, grad_reqs,
+                      aux_arrays, group_to_ctx, shared_exec);
+}
+inline Symbol operator+(mx_float lhs, const Symbol &rhs) { return rhs + lhs; }
+inline Symbol operator-(mx_float lhs, const Symbol &rhs) {
+  return mxnet::cpp::_RMinusScalar(lhs, rhs);
+}
+inline Symbol operator*(mx_float lhs, const Symbol &rhs) { return rhs * lhs; }
+inline Symbol operator/(mx_float lhs, const Symbol &rhs) {
+  return mxnet::cpp::_RDivScalar(lhs, rhs);
+}
+}  // namespace cpp
+}  // namespace mxnet
+
+#endif  // CPP_PACKAGE_INCLUDE_MXNET_CPP_SYMBOL_HPP_
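Taken together, the header and the implementation above support the following end-to-end flow. This is a review-time sketch only, not part of the patch: it assumes the rest of the cpp-package (`MxNetCpp.h`, `NDArray`, `Executor`, `Shape`) is available and libmxnet is linked, and the variable names and shapes are made up.

```cpp
// Minimal sketch: compose a symbol, infer shapes/arguments, bind, run forward.
#include <map>
#include <string>
#include <vector>
#include "mxnet-cpp/MxNetCpp.h"

using namespace mxnet::cpp;

int main() {
  // Compose a small symbolic expression from variables using the
  // operator overloads implemented above.
  Symbol x = Symbol::Variable("x");
  Symbol w = Symbol::Variable("w");
  Symbol y = x * w + 1.0f;

  // Infer all shapes from the shapes of the known arguments.
  std::map<std::string, std::vector<mx_uint> > arg_shapes;
  arg_shapes["x"] = {2, 3};
  arg_shapes["w"] = {2, 3};
  std::vector<std::vector<mx_uint> > in_shapes, aux_shapes, out_shapes;
  y.InferShape(arg_shapes, &in_shapes, &aux_shapes, &out_shapes);

  // Provide one argument; InferArgsMap creates and randomly initializes
  // the missing ones (here "w").
  std::map<std::string, NDArray> args;
  args["x"] = NDArray(Shape(2, 3), Context::cpu(), false);
  y.InferArgsMap(Context::cpu(), &args, args);

  // Bind to a CPU executor and run one forward pass.
  Executor *exec = y.SimpleBind(Context::cpu(), args);
  exec->Forward(false);
  NDArray::WaitAll();
  delete exec;  // SimpleBind returns a raw pointer that must be freed manually
  return 0;
}
```

Note the last line: as documented above, both `SimpleBind` and `Bind` hand ownership of the returned `Executor` to the caller.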
diff --git a/cpp-package/scripts/lint.py b/cpp-package/scripts/lint.py
new file mode 100644
index 000000000000..89492eda4d82
--- /dev/null
+++ b/cpp-package/scripts/lint.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# pylint: disable=protected-access, unused-variable, locally-disabled, redefined-variable-type
+"""Lint helper to generate lint summary of source.
+
+Copyright by Contributors
+"""
+import codecs
+import sys
+import re
+import os
+import cpplint
+from cpplint import _cpplint_state
+from pylint import epylint
+
+CXX_SUFFIX = set(['cc', 'c', 'cpp', 'h', 'cu', 'hpp'])
+PYTHON_SUFFIX = set(['py'])
+
+class LintHelper(object):
+    """Class to help run the lint and record the summary"""
+
+    @staticmethod
+    def _print_summary_map(strm, result_map, ftype):
+        """Print summary of certain result map."""
+        if len(result_map) == 0:
+            return 0
+        npass = len([x for k, x in result_map.iteritems() if len(x) == 0])
+        strm.write('=====%d/%d %s files passed check=====\n' % (npass, len(result_map), ftype))
+        for fname, emap in result_map.iteritems():
+            if len(emap) == 0:
+                continue
+            strm.write('%s: %d Errors of %d Categories map=%s\n' % (
+                fname, sum(emap.values()), len(emap), str(emap)))
+        return len(result_map) - npass
+
+    def __init__(self):
+        self.project_name = None
+        self.cpp_header_map = {}
+        self.cpp_src_map = {}
+        self.python_map = {}
+        pylint_disable = ['superfluous-parens',
+                          'too-many-instance-attributes',
+                          'too-few-public-methods']
+        # setup pylint
+        self.pylint_opts = ['--extension-pkg-whitelist=numpy',
+                            '--disable=' + ','.join(pylint_disable)]
+
+        self.pylint_cats = set(['error', 'warning', 'convention', 'refactor'])
+        # setup cpp lint
+        cpplint_args = ['.', '--extensions=' + (','.join(CXX_SUFFIX))]
+        _ = cpplint.ParseArguments(cpplint_args)
+        cpplint._SetFilters(','.join(['-build/c++11',
+                                      '-build/namespaces',
+                                      '-build/include',
+                                      '-build/header_guard',
+                                      '+build/include_what_you_use',
+                                      '+build/include_order']))
+        cpplint._SetCountingStyle('toplevel')
+        cpplint._line_length = 100
+
+    def process_cpp(self, path, suffix):
+        """Process a cpp file."""
+        _cpplint_state.ResetErrorCounts()
+        cpplint.ProcessFile(str(path), _cpplint_state.verbose_level)
+        _cpplint_state.PrintErrorCounts()
+        errors = _cpplint_state.errors_by_category.copy()
+
+        if suffix == 'h':
+            self.cpp_header_map[str(path)] = errors
+        else:
+            self.cpp_src_map[str(path)] = errors
+
+    def process_python(self, path):
+        """Process a python file."""
+        (pylint_stdout, pylint_stderr) = epylint.py_run(
+            ' '.join([str(path)] + self.pylint_opts), return_std=True)
+        emap = {}
+        print pylint_stderr.read()
+        for line in pylint_stdout:
+            sys.stderr.write(line)
+            key = line.split(':')[-1].split('(')[0].strip()
+            if key not in self.pylint_cats:
+                continue
+            if key not in emap:
+                emap[key] = 1
+            else:
+                emap[key] += 1
+        sys.stderr.write('\n')
+        self.python_map[str(path)] = emap
+
+    def print_summary(self, strm):
+        """Print summary of lint."""
+        nerr = 0
+        nerr += LintHelper._print_summary_map(strm, self.cpp_header_map, 'cpp-header')
+        nerr += LintHelper._print_summary_map(strm, self.cpp_src_map, 'cpp-source')
+        nerr += LintHelper._print_summary_map(strm, self.python_map, 'python')
+        if nerr == 0:
+            strm.write('All passed!\n')
+        else:
+            strm.write('%d files failed lint\n' % nerr)
+        return nerr
+
+# singleton helper for lint check
+_HELPER = LintHelper()
+
+def get_header_guard_dmlc(filename):
+    """Get Header Guard Convention for DMLC Projects.
+
+    For headers in include, directly use the path
+    For headers in src, use project name plus path
+    Examples: with project-name = dmlc
+    include/dmlc/timer.h -> DMLC_TIMER_H_
+    src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
+    """
+    fileinfo = cpplint.FileInfo(filename)
+    file_path_from_root = fileinfo.RepositoryName()
+    inc_list = ['include', 'api', 'wrapper']
+
+    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
+        idx = file_path_from_root.find('src/')
+        file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3:]
+    else:
+        for spath in inc_list:
+            prefix = spath + os.sep
+            if file_path_from_root.startswith(prefix):
+                file_path_from_root = re.sub('^' + prefix, '', file_path_from_root)
+                break
+    return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc
+
+def process(fname, allow_type):
+    """Process a file."""
+    fname = str(fname)
+    # HACK: ignore op.h which is automatically generated
+    if fname.endswith('op.h'):
+        return
+    arr = fname.rsplit('.', 1)
+    if fname.find('#') != -1 or arr[-1] not in allow_type:
+        return
+    if arr[-1] in CXX_SUFFIX:
+        _HELPER.process_cpp(fname, arr[-1])
+    if arr[-1] in PYTHON_SUFFIX:
+        _HELPER.process_python(fname)
+
+def main():
+    """Main entry function."""
+    if len(sys.argv) < 3:
+        print('Usage: <project-name> <filetype> <list-of-paths>')
+        print('\tfiletype can be python/cpp/all')
+        exit(-1)
+    _HELPER.project_name = sys.argv[1]
+    file_type = sys.argv[2]
+    allow_type = []
+    if file_type == 'python' or file_type == 'all':
+        allow_type += [x for x in PYTHON_SUFFIX]
+    if file_type == 'cpp' or file_type == 'all':
+        allow_type += [x for x in CXX_SUFFIX]
+    allow_type = set(allow_type)
+    if os.name != 'nt':
+        sys.stderr = codecs.StreamReaderWriter(sys.stderr,
+                                               codecs.getreader('utf8'),
+                                               codecs.getwriter('utf8'),
+                                               'replace')
+    for path in sys.argv[3:]:
+        if os.path.isfile(path):
+            process(path, allow_type)
+        else:
+            for root, dirs, files in os.walk(path):
+                for name in files:
+                    process(os.path.join(root, name), allow_type)
+
+    nerr = _HELPER.print_summary(sys.stderr)
+    sys.exit(nerr > 0)
+
+if __name__ == '__main__':
+    main()
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
new file mode 100644
index 000000000000..b1fce38d5738
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.py
@@ -0,0 +1,401 @@
+# -*- coding: utf-8 -*-
+from ctypes import *
+from ctypes.util import find_library
+import os
+import logging
+import platform
+import re
+import sys
+import tempfile
+
+class EnumType:
+    name = ''
+    enumValues = []
+    def __init__(self, typeName = 'ElementWiseOpType', \
+                 typeString = "{'avg', 'max', 'sum'}"):
+        self.name = typeName
+        if (typeString[0] == '{'):  # is a enum type
+            isEnum = True
+            # parse enum
+            self.enumValues = typeString[typeString.find('{') + 1:typeString.find('}')].split(',')
+            for i in range(0, len(self.enumValues)):
+                self.enumValues[i] = self.enumValues[i].strip().strip("'")
+        else:
+            logging.warn("trying to parse non-enum type as enum: %s" % typeString)
+    def GetDefinitionString(self, indent = 0):
+        indentStr = ' ' * indent
+        ret = indentStr + 'enum class %s {\n' % self.name
+        for i in range(0, len(self.enumValues)):
+            ret = ret + indentStr + '  %s = %d' % (self.enumValues[i], i)
+            if (i != len(self.enumValues) - 1):
+                ret = ret + ","
+            ret = ret + "\n"
+        ret = ret + "};\n"
+        return ret
+    def GetDefaultValueString(self, value = ''):
+        return self.name + "::" + value
+    def GetEnumStringArray(self, indent = 0):
+        indentStr = ' ' * indent
+        ret = indentStr + 'static const char *%sValues[] = {\n' % self.name
+        for i in range(0, len(self.enumValues)):
+            ret = ret + indentStr + '  "%s"' % self.enumValues[i]
+            if (i != len(self.enumValues) - 1):
+                ret = ret + ","
+            ret = ret + "\n"
+        ret = ret + indentStr + "};\n"
+        return ret
+    def GetConvertEnumVariableToString(self, variable=''):
+        return "%sValues[int(%s)]" % (self.name, variable)
+
+
+class Arg:
+    typeDict = {'boolean':'bool',\
+        'Shape(tuple)':'Shape',\
+        'Symbol':'Symbol',\
+        'NDArray':'Symbol',\
+        'ndarray-or-symbol':'Symbol',\
+        'Symbol[]':'const std::vector<Symbol>&',\
+        'Symbol or Symbol[]':'const std::vector<Symbol>&',\
+        'NDArray[]':'const std::vector<Symbol>&',\
+        'ndarray-or-symbol[]':'const std::vector<Symbol>&',\
+        'caffe-layer-parameter':'::caffe::LayerParameter',\
+        'float':'mx_float',\
+        'real_t':'mx_float',\
+        'int':'int',\
+        'int (non-negative)': 'uint32_t',\
+        'long (non-negative)': 'uint64_t',\
+        'int or None':'dmlc::optional<int>',\
+        'long':'int64_t',\
+        'double':'double',\
+        'string':'const std::string&'}
+    name = ''
+    type = ''
+    description = ''
+    isEnum = False
+    enum = None
+    hasDefault = False
+    defaultString = ''
+    def __init__(self, opName = '', argName = '', typeString = '', descString = ''):
+        self.name = argName
+        self.description = descString
+        if (typeString[0] == '{'):  # is enum type
+            self.isEnum = True
+            self.enum = EnumType(self.ConstructEnumTypeName(opName, argName), typeString)
+            self.type = self.enum.name
+        else:
+            try:
+                self.type = self.typeDict[typeString.split(',')[0]]
+            except:
+                print 'argument "%s" of operator "%s" has unknown type "%s"' % (argName, opName, typeString)
+                pass
+        if typeString.find('default=') != -1:
+            self.hasDefault = True
+            self.defaultString = typeString.split('default=')[1].strip().strip("'")
+            if typeString.startswith('string'):
+                self.defaultString = self.MakeCString(self.defaultString)
+            elif self.isEnum:
+                self.defaultString = self.enum.GetDefaultValueString(self.defaultString)
+            elif self.defaultString == 'None':
+                self.defaultString = self.type + '()'
+            elif self.defaultString == 'False':
+                self.defaultString = 'false'
+            elif self.defaultString == 'True':
+                self.defaultString = 'true'
+            elif self.defaultString[0] == '(':
+                self.defaultString = 'Shape' + self.defaultString
+            elif self.type == 'dmlc::optional<int>':
+                self.defaultString = self.type + '(' + self.defaultString + ')'
+            elif typeString.startswith('caffe-layer-parameter'):
+                self.defaultString = 'textToCaffeLayerParameter(' + self.MakeCString(self.defaultString) + ')'
+                hasCaffe = True
+
+    def MakeCString(self, str):
+        str = str.replace('\n', "\\n")
+        str = str.replace('\t', "\\t")
+        return '\"' + str + '\"'
+
+    def ConstructEnumTypeName(self, opName = '', argName = ''):
+        a = opName[0].upper()
+        # format ArgName so instead of act_type it returns ActType
+        argNameWords = argName.split('_')
+        argName = ''
+        for an in argNameWords:
+            argName = argName + an[0].upper() + an[1:]
+        typeName = a + opName[1:] + argName
+        return typeName
+
+class Op:
+    name = ''
+    description = ''
+    args = []
+
+    def __init__(self, name = '', description = '', args = []):
+        self.name = name
+        self.description = description
+        # add a 'name' argument
+        nameArg = Arg(self.name, \
+                      'symbol_name', \
+                      'string', \
+                      'name of the resulting symbol')
+        args.insert(0, nameArg)
+        # reorder arguments, put those with default value to the end
+        orderedArgs = []
+        for arg in args:
+            if not arg.hasDefault:
+                orderedArgs.append(arg)
+        for arg in args:
+            if arg.hasDefault:
+                orderedArgs.append(arg)
+        self.args = orderedArgs
+
+    def WrapDescription(self, desc = ''):
+        ret = []
+        sentences = desc.split('.')
+        lines = desc.split('\n')
+        for line in lines:
+            line = line.strip()
+            if len(line) <= 80:
+                ret.append(line.strip())
+            else:
+                while len(line) > 80:
+                    pos = line.rfind(' ', 0, 80) + 1
+                    if pos <= 0:
+                        pos = line.find(' ')
+                    if pos < 0:
+                        pos = len(line)
+                    ret.append(line[:pos].strip())
+                    line = line[pos:]
+        return ret
+
+    def GenDescription(self, desc = '', \
+                       firstLineHead = ' * \\brief ', \
+                       otherLineHead = ' * '):
+        ret = ''
+        descs = self.WrapDescription(desc)
+        ret = ret + firstLineHead
+        if len(descs) == 0:
+            return ret.rstrip()
+        ret = (ret + descs[0]).rstrip() + '\n'
+        for i in range(1, len(descs)):
+            ret = ret + (otherLineHead + descs[i]).rstrip() + '\n'
+        return ret
+
+    def GetOpDefinitionString(self, use_name, indent=0):
+        ret = ''
+        indentStr = ' ' * indent
+        # define enums if any
+        for arg in self.args:
+            if arg.isEnum and use_name:
+                # comments
+                ret = ret + self.GenDescription(arg.description, \
+                                                '/*! \\brief ', \
+                                                ' * ')
+                ret = ret + " */\n"
+                # definition
+                ret = ret + arg.enum.GetDefinitionString(indent) + '\n'
+        # create function comments
+        ret = ret + self.GenDescription(self.description, \
+                                        '/*!\n * \\brief ', \
+                                        ' * ')
+        for arg in self.args:
+            if arg.name != 'symbol_name' or use_name:
+                ret = ret + self.GenDescription(arg.name + ' ' + arg.description, \
+                                                ' * \\param ', \
+                                                ' * ')
+        ret = ret + " * \\return new symbol\n"
+        ret = ret + " */\n"
+        # create function header
+        declFirstLine = indentStr + 'inline Symbol %s(' % self.name
+        ret = ret + declFirstLine
+        argIndentStr = ' ' * len(declFirstLine)
+        arg_start = 0 if use_name else 1
+        if len(self.args) > arg_start:
+            ret = ret + self.GetArgString(self.args[arg_start])
+        for i in range(arg_start + 1, len(self.args)):
+            ret = ret + ',\n'
+            ret = ret + argIndentStr + self.GetArgString(self.args[i])
+        ret = ret + ') {\n'
+        # create function body
+        # if there is enum, generate static enum<->string mapping
+        for arg in self.args:
+            if arg.isEnum:
+                ret = ret + arg.enum.GetEnumStringArray(indent + 2)
+        # now generate code
+        ret = ret + indentStr + '  return Operator(\"%s\")\n' % self.name
+        for arg in self.args:  # set params
+            if arg.type == 'Symbol' or \
+               arg.type == 'const std::string&' or \
+               arg.type == 'const std::vector<Symbol>&':
+                continue
+            v = arg.name
+            if arg.isEnum:
+                v = arg.enum.GetConvertEnumVariableToString(v)
+            ret = ret + indentStr + ' ' * 11 + \
+                  '.SetParam(\"%s\", %s)\n' % (arg.name, v)
+        #ret = ret[:-1]  # get rid of the last \n
+        symbols = ''
+        inputAlreadySet = False
+        for arg in self.args:  # set inputs
+            if arg.type != 'Symbol':
+                continue
+            inputAlreadySet = True
+            #if symbols != '':
+            #    symbols = symbols + ', '
+            #symbols = symbols + arg.name
+            ret = ret + indentStr + ' ' * 11 + \
+                  '.SetInput(\"%s\", %s)\n' % (arg.name, arg.name)
+        for arg in self.args:  # set input arrays vector<Symbol>
+            if arg.type != 'const std::vector<Symbol>&':
+                continue
+            if (inputAlreadySet):
+                logging.error("op %s has both Symbol[] and Symbol inputs!" % self.name)
+            inputAlreadySet = True
+            symbols = arg.name
+            ret = ret + '(%s)\n' % symbols
+        ret = ret + indentStr + ' ' * 11
+        if use_name:
+            ret = ret + '.CreateSymbol(symbol_name);\n'
+        else:
+            ret = ret + '.CreateSymbol();\n'
+        ret = ret + indentStr + '}\n'
+        return ret
+
+    def GetArgString(self, arg):
+        ret = '%s %s' % (arg.type, arg.name)
+        if arg.hasDefault:
+            ret = ret + ' = ' + arg.defaultString
+        return ret
+
+
+def ParseAllOps():
+    """
+    MXNET_DLL int MXSymbolListAtomicSymbolCreators(mx_uint *out_size,
+                                                   AtomicSymbolCreator **out_array);
+
+    MXNET_DLL int MXSymbolGetAtomicSymbolInfo(AtomicSymbolCreator creator,
+                                              const char **name,
+                                              const char **description,
+                                              mx_uint *num_args,
+                                              const char ***arg_names,
+                                              const char ***arg_type_infos,
+                                              const char ***arg_descriptions,
+                                              const char **key_var_num_args);
+    """
+    cdll.libmxnet = cdll.LoadLibrary(sys.argv[1])
+    ListOP = cdll.libmxnet.MXSymbolListAtomicSymbolCreators
+    GetOpInfo = cdll.libmxnet.MXSymbolGetAtomicSymbolInfo
+    ListOP.argtypes = [POINTER(c_int), POINTER(POINTER(c_void_p))]
+    GetOpInfo.argtypes = [c_void_p, \
+                          POINTER(c_char_p), \
+                          POINTER(c_char_p), \
+                          POINTER(c_int), \
+                          POINTER(POINTER(c_char_p)), \
+                          POINTER(POINTER(c_char_p)), \
+                          POINTER(POINTER(c_char_p)), \
+                          POINTER(c_char_p), \
+                          POINTER(c_char_p)
+                          ]
+
+    nOps = c_int()
+    opHandlers = POINTER(c_void_p)()
+    r = ListOP(byref(nOps), byref(opHandlers))
+    ret = ''
+    ret2 = ''
+    for i in range(0, nOps.value):
+        handler = opHandlers[i]
+        name = c_char_p()
+        description = c_char_p()
+        nArgs = c_int()
+        argNames = POINTER(c_char_p)()
+        argTypes = POINTER(c_char_p)()
+        argDescs = POINTER(c_char_p)()
+        varArgName = c_char_p()
+        return_type = c_char_p()
+
+        GetOpInfo(handler, byref(name), byref(description), \
+                  byref(nArgs), byref(argNames), byref(argTypes), \
+                  byref(argDescs), byref(varArgName), byref(return_type))
+
+        if name.value[0] == '_':  # get rid of functions like __init__
+            continue
+
+        args = []
+
+        for i in range(0, nArgs.value):
+            arg = Arg(name.value,
+                      argNames[i],
+                      argTypes[i],
+                      argDescs[i])
+            args.append(arg)
+
+        op = Op(name.value, description.value, args)
+
+        ret = ret + op.GetOpDefinitionString(True) + "\n"
+        ret2 = ret2 + op.GetOpDefinitionString(False) + "\n"
+    return ret + ret2
+
+if __name__ == "__main__":
+    #et = EnumType(typeName = 'MyET')
+    reload(sys)
+    sys.setdefaultencoding('UTF8')
+    #print(et.GetDefinitionString())
+    #print(et.GetEnumStringArray())
+    #arg = Arg()
+    #print(arg.ConstructEnumTypeName('SoftmaxActivation', 'act_type'))
+    #arg = Arg(opName = 'FullConnected', argName='act_type', \
+    #          typeString="{'elu', 'leaky', 'prelu', 'rrelu'},optional, default='leaky'", \
+    #          descString='Activation function to be applied.')
+    #print(arg.isEnum)
+    #print(arg.defaultString)
+    #arg = Arg("fc", "alpha", "float, optional, default=0.0001", "alpha")
+    #decl = "%s %s" % (arg.type, arg.name)
+    #if arg.hasDefault:
+    #    decl = decl + "=" + arg.defaultString
+    #print(decl)
+
+    temp_file_name = ""
+    output_file = '../../include/mxnet-cpp/op.h'
+    try:
+        # generate file header
+        patternStr = ("/*!\n"
+                      "* Copyright (c) 2016 by Contributors\n"
+                      "* \\file op.h\n"
+                      "* \\brief definition of all the operators\n"
+                      "* \\author Chuntao Hong, Xin Li\n"
+                      "*/\n"
+                      "\n"
+                      "#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_H_\n"
+                      "#define CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_H_\n"
+                      "\n"
+                      "#include <string>\n"
+                      "#include <vector>\n"
+                      "#include \"mxnet-cpp/base.h\"\n"
+                      "#include \"mxnet-cpp/shape.h\"\n"
+                      "#include \"mxnet-cpp/op_util.h\"\n"
+                      "#include \"mxnet-cpp/operator.h\"\n"
"#include \"dmlc/optional.h\"\n" + "\n" + "namespace mxnet {\n" + "namespace cpp {\n" + "\n" + "%s" + "} //namespace cpp\n" + "} //namespace mxnet\n" + "#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_OP_H_\n") + + # Generate a temporary file name + tf = tempfile.NamedTemporaryFile() + temp_file_name = tf.name + tf.close() + with open(temp_file_name, 'w') as f: + f.write(patternStr % ParseAllOps()) + + except Exception, e: + os.remove(output_file) + if len(temp_file_name) > 0: + os.remove(temp_file_name) + raise(e) + + os.system('./move-if-change.sh ' + temp_file_name + ' ' + output_file) + pass + diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj new file mode 100644 index 000000000000..b2d8448b830d --- /dev/null +++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj @@ -0,0 +1,28 @@ + + + + Debug + 2.0 + {027054bd-8dd3-4d2e-8032-22e339846ed1} + + OpWrapperGenerator.py + + . + . + {888888a0-9f3d-457c-b088-3a5042f75d52} + Standard Python launcher + + + + + + + 10.0 + $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets + + + + + + + \ No newline at end of file diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln new file mode 100644 index 000000000000..71dc32749769 --- /dev/null +++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln @@ -0,0 +1,20 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 2013 +VisualStudioVersion = 12.0.40629.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "OpWrapperGenerator", "OpWrapperGenerator.pyproj", "{027054BD-8DD3-4D2E-8032-22E339846ED1}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {027054BD-8DD3-4D2E-8032-22E339846ED1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {027054BD-8DD3-4D2E-8032-22E339846ED1}.Release|Any CPU.ActiveCfg = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/cpp-package/src/OpWrapperGenerator/README.md b/cpp-package/src/OpWrapperGenerator/README.md new file mode 100644 index 000000000000..8fb45ec661f2 --- /dev/null +++ b/cpp-package/src/OpWrapperGenerator/README.md @@ -0,0 +1 @@ +## This is a python script that generates operator wrappers such as FullyConnected, based on current libmxnet.dll. This script is written so that we don't need to write new operator wrappers when new ones are added to the library. diff --git a/cpp-package/src/OpWrapperGenerator/move-if-change.sh b/cpp-package/src/OpWrapperGenerator/move-if-change.sh new file mode 100755 index 000000000000..c475fae5a847 --- /dev/null +++ b/cpp-package/src/OpWrapperGenerator/move-if-change.sh @@ -0,0 +1,18 @@ +#!/bin/sh +if [ -z "$1" ] || [ -z "$2" ]; then + echo "Usage: $0 " +fi + +if [ ! -f "$2" ]; then + mv -v "$1" "$2" + exit 0 +fi + +diff "$1" "$2" >/dev/null + +if [ $? 
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj
new file mode 100644
index 000000000000..b2d8448b830d
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.pyproj
@@ -0,0 +1,28 @@
[Visual Studio Python project file; the XML markup was stripped during extraction.
Recoverable values: configuration Debug, schema version 2.0, project GUID
{027054bd-8dd3-4d2e-8032-22e339846ed1}, startup file OpWrapperGenerator.py,
working/search path ".", launcher {888888a0-9f3d-457c-b088-3a5042f75d52}
"Standard Python launcher", tools version 10.0, and the standard
Microsoft.PythonTools.targets import.]
diff --git a/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln
new file mode 100644
index 000000000000..71dc32749769
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/OpWrapperGenerator.sln
@@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.40629.0
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "OpWrapperGenerator", "OpWrapperGenerator.pyproj", "{027054BD-8DD3-4D2E-8032-22E339846ED1}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{027054BD-8DD3-4D2E-8032-22E339846ED1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{027054BD-8DD3-4D2E-8032-22E339846ED1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/cpp-package/src/OpWrapperGenerator/README.md b/cpp-package/src/OpWrapperGenerator/README.md
new file mode 100644
index 000000000000..8fb45ec661f2
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/README.md
@@ -0,0 +1 @@
+## This is a python script that generates operator wrappers such as FullyConnected, based on the current libmxnet.dll. This script is written so that we don't need to write new operator wrappers when new operators are added to the library.
diff --git a/cpp-package/src/OpWrapperGenerator/move-if-change.sh b/cpp-package/src/OpWrapperGenerator/move-if-change.sh
new file mode 100755
index 000000000000..c475fae5a847
--- /dev/null
+++ b/cpp-package/src/OpWrapperGenerator/move-if-change.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+if [ -z "$1" ] || [ -z "$2" ]; then
+  echo "Usage: $0 <source-file> <target-file>"
+  exit 1
+fi
+
+if [ ! -f "$2" ]; then
+  mv -v "$1" "$2"
+  exit 0
+fi
+
+diff "$1" "$2" >/dev/null
+
+if [ $? -ne 0 ]; then
+  mv -v "$1" "$2"
+else
+  rm -f "$1"
+fi
+
diff --git a/cpp-package/tests/travis/run_test.sh b/cpp-package/tests/travis/run_test.sh
new file mode 100755
index 000000000000..27506584f40c
--- /dev/null
+++ b/cpp-package/tests/travis/run_test.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+if [ ${TASK} == "lint" ]; then
+    make lint || exit -1
+    echo "Check documentation of c++ code..."
+    make doc 2>log.txt
+    (cat log.txt | grep -v ENABLE_PREPROCESSING | grep -v "unsupported tag") > logclean.txt
+    echo "---------Error Log----------"
+    cat logclean.txt
+    echo "----------------------------"
+    (cat logclean.txt | grep warning) && exit -1
+    (cat logclean.txt | grep error) && exit -1
+    exit 0
+fi
+
+if [ ${TRAVIS_OS_NAME} == "linux" ]; then
+    # use g++-4.8 in linux
+    export CXX=g++-4.8
+fi
+
+if [ ${TASK} == "build" ]; then
+    make
+    exit $?
+fi
diff --git a/cpp-package/tests/travis/setup.sh b/cpp-package/tests/travis/setup.sh
new file mode 100755
index 000000000000..4238c7654fe4
--- /dev/null
+++ b/cpp-package/tests/travis/setup.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+if [ ${TASK} == "lint" ]; then
+    pip install cpplint 'pylint==1.4.4' 'astroid==1.3.6' --user
+fi
diff --git a/dmlc-core b/dmlc-core
index a79b0df25c42..b5bec5481df8 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit a79b0df25c42b9612dcdfd950d91ce0928a394cd
+Subproject commit b5bec5481df86e8e6728d8bd80a61d87ef3b2cd5
diff --git a/docker/.gitignore b/docker/.gitignore
new file mode 100644
index 000000000000..2a377effe731
--- /dev/null
+++ b/docker/.gitignore
@@ -0,0 +1,2 @@
+Dockerfile.*
+!Dockerfile.in.*
diff --git a/docker/Dockerfiles/Dockerfile.in.julia b/docker/Dockerfiles/Dockerfile.in.julia
new file mode 100644
index 000000000000..42422ddbed54
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.julia
@@ -0,0 +1,7 @@
+# -*- mode: dockerfile -*-
+# part of the dockerfile to install the julia binding
+
+COPY install/julia.sh install/
+RUN install/julia.sh
+ENV MXNET_HOME /mxnet
+RUN julia -e 'Pkg.add("MXNet")'
diff --git a/docker/Dockerfiles/Dockerfile.in.lib.cpu b/docker/Dockerfiles/Dockerfile.in.lib.cpu
new file mode 100644
index 000000000000..002e2d1e4209
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.lib.cpu
@@ -0,0 +1,10 @@
+# -*- mode: dockerfile -*-
+# dockerfile to build libmxnet.so on CPU
+FROM ubuntu:14.04
+
+COPY install/cpp.sh install/
+RUN install/cpp.sh
+
+RUN git clone --recursive https://github.com/dmlc/mxnet && cd mxnet && \
+    make -j$(nproc) && \
+    rm -r build
diff --git a/docker/Dockerfiles/Dockerfile.in.lib.gpu b/docker/Dockerfiles/Dockerfile.in.lib.gpu
new file mode 100644
index 000000000000..2185babf085c
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.lib.gpu
@@ -0,0 +1,9 @@
+# -*- mode: dockerfile -*-
+# dockerfile to build libmxnet.so on GPU
+FROM nvidia/cuda:8.0-cudnn5-devel
+
+COPY install/cpp.sh install/
+RUN install/cpp.sh
+
+RUN git clone --recursive https://github.com/dmlc/mxnet && cd mxnet && \
+    make -j$(nproc) USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1
diff --git a/docker/Dockerfiles/Dockerfile.in.python b/docker/Dockerfiles/Dockerfile.in.python
new file mode 100644
index 000000000000..b7979b231d7d
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.python
@@ -0,0 +1,6 @@
+# -*- mode: dockerfile -*-
+# part of the dockerfile to install the python binding
+
+COPY install/python.sh install/
+RUN install/python.sh
+ENV PYTHONPATH=/mxnet/python
diff --git a/docker/Dockerfiles/Dockerfile.in.r-lang b/docker/Dockerfiles/Dockerfile.in.r-lang
new file mode 100644
index 000000000000..321094ec6c63
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.r-lang
@@ -0,0 +1,7 @@
+# -*- mode: dockerfile -*-
+# part of the dockerfile to install the r binding
+
+COPY install/r.sh install/
+ADD https://raw.githubusercontent.com/dmlc/mxnet/master/R-package/DESCRIPTION install/
+RUN install/r.sh
+RUN cd mxnet && make rpkg && R CMD INSTALL mxnet_current_r.tar.gz
diff --git a/docker/Dockerfiles/Dockerfile.in.scala b/docker/Dockerfiles/Dockerfile.in.scala
new file mode 100644
index 000000000000..a45fbfdaaab6
--- /dev/null
+++ b/docker/Dockerfiles/Dockerfile.in.scala
@@ -0,0 +1,7 @@
+# -*- mode: dockerfile -*-
+# part of the dockerfile to install the scala binding
+
+COPY install/scala.sh install/
+RUN install/scala.sh
+
+RUN cd mxnet && make scalapkg
diff --git a/docker/README.md b/docker/README.md
index 5b2897ade43f..95fa668e97d7 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,58 +1,113 @@
 # Docker images for MXNET
 
-Pre-built docker images are available at https://hub.docker.com/r/dmlc/mxnet/
-
 ## How to use
 
-1. First pull the pre-built image
+First make sure [docker](https://docs.docker.com/engine/installation/) is
+installed. The docker plugin
+[nvidia-docker](https://github.com/NVIDIA/nvidia-docker) is required to run on
+Nvidia GPUs.
+
+Pre-built docker containers are available at https://hub.docker.com/r/mxnet/
+
+For example, the following command launches a container with the Python package
+installed. It will pull the docker image from docker hub if it does not exist
+locally.
+
+```bash
+docker run -ti --rm mxnet/python
+```
+
+Then you can run MXNet in python, e.g.:
+
+```bash
+# python -c 'import mxnet as mx; a = mx.nd.ones((2,3)); print((a*2).asnumpy())'
+[[ 2.  2.  2.]
+ [ 2.  2.  2.]]
+```
+
+If the host machine has at least one GPU installed and `nvidia-docker` is
+installed, i.e. `nvidia-docker run --rm nvidia/cuda nvidia-smi` runs
+successfully, then you can run a container with GPU support:
+
+```bash
+nvidia-docker run -ti --rm mxnet/python:gpu
+```
+
+Now you can run the above example on `GPU 0`:
+
+```bash
+# python -c 'import mxnet as mx; a = mx.nd.ones((2,3), mx.gpu(0)); print((a*2).asnumpy())'
+[[ 2.  2.  2.]
+ [ 2.  2.  2.]]
+```
+
+## Hosted containers
+
+All images are based on Ubuntu 14.04. The `gpu` tag is built with CUDA 8.0 and
+cuDNN 5.
+
+### Python
 
-   ```bash
-   docker pull dmlc/mxnet
-   ```
-2. Then we can run the python shell in the docker
+Hosted at https://hub.docker.com/r/mxnet/python/
 
-   ```bash
-   docker run -ti dmlc/mxnet python
-   ```
-   For example
-   ```bash
-   $ docker run -ti dmlc/mxnet python
-   Python 2.7.6 (default, Jun 22 2015, 17:58:13)
-   [GCC 4.8.2] on linux2
-   Type "help", "copyright", "credits" or "license" for more information.
-   >>> import mxnet as mx
-   import mxnet as mx
-   >>> quit()
-   quit()
-   ```
+Python versions: 2.7.12 and 3.5.2.
 
-   Note: One may get the error message `libdc1394 error: Failed to initialize
-   libdc1394`, which is due to opencv and can be ignored.
+Available tags:
 
-3. Train a model on MNIST to check everything works
+- mxnet/python
+- mxnet/python:gpu
 
-   ```
-   docker run dmlc/mxnet python /mxnet/example/image-classification/train_mnist.py
-   ```
+### R
 
-If the host machine has Nvidia GPUs, we can use `dmlc/mxnet:cuda`, which has both CUDA and CUDNN installed.
-To launch the docker, we need to install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) first.
+Hosted at https://hub.docker.com/r/mxnet/r-lang/
 
-1. Pull the image
+R version: 3.3.3
 
-   ```bash
-   docker pull dmlc/mxnet:cuda
-   ```
+Available tags:
 
-2. Train MNIST on GPU 0
+- mxnet/r-lang
+- mxnet/r-lang:gpu
 
-   ```bash
-   nvidia-docker run dmlc/mxnet:cuda python /mxnet/example/image-classification/train_mnist.py --gpus 0
-   ```
+
+### Julia
+
+Hosted at https://hub.docker.com/r/mxnet/julia/
+
+Julia version: 0.5.1
+
+Available tags:
+
+- mxnet/julia
+- mxnet/julia:gpu
+
+### Scala
+
+Hosted at https://hub.docker.com/r/mxnet/scala/
+
+Scala version: 2.11.8
+
+Available tags:
+
+- mxnet/scala
 
 ## How to build
 
+The following command builds the default Python package:
+
+```bash
+./tool.sh build python cpu
+```
+
+Run `./tool.sh` for more details.
+
+Tip: the following commands stop all docker containers and delete all docker
+images.
+
+```bash
+docker stop $(docker ps -a -q)
+docker rm $(docker ps -a -q)
+```
+
 ```bash
-docker build -t dmlc/mxnet:cpu cpu
-docker build -t dmlc/mxnet:cuda cuda
+docker rmi $(docker images -a -q)
 ```
diff --git a/docker/cpu/Dockerfile b/docker/cpu/Dockerfile
deleted file mode 100644
index 1e5a956450dc..000000000000
--- a/docker/cpu/Dockerfile
+++ /dev/null
@@ -1,13 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER Mu Li
-
-# install the core library
-RUN apt-get update && apt-get install -y build-essential git libopenblas-dev libopencv-dev
-RUN git clone --recursive https://github.com/dmlc/mxnet/ && cd mxnet && \
-    cp make/config.mk . && \
-    echo "USE_BLAS=openblas" >>config.mk && \
-    make -j$(nproc)
-
-# python pakcage
-RUN apt-get install -y python-numpy wget unzip
-ENV PYTHONPATH /mxnet/python
diff --git a/docker/cuda/7.5/Dockerfile b/docker/cuda/7.5/Dockerfile
deleted file mode 100644
index ff0b0bbb2cd6..000000000000
--- a/docker/cuda/7.5/Dockerfile
+++ /dev/null
@@ -1,25 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel
-MAINTAINER Qingsong Liu
-
-RUN apt-get update && apt-get install -y \
-    git \
-    libopenblas-dev \
-    libopencv-dev \
-    python-dev \
-    python-numpy \
-    python-setuptools \
-    wget \
-    python-pip \
-    unzip
-
-RUN cd /root && git clone --recursive https://github.com/dmlc/mxnet && cd mxnet && \
-    cp make/config.mk . && \
-    sed -i 's/USE_BLAS = atlas/USE_BLAS = openblas/g' config.mk && \
-    sed -i 's/USE_CUDA = 0/USE_CUDA = 1/g' config.mk && \
-    sed -i 's/USE_CUDA_PATH = NONE/USE_CUDA_PATH = \/usr\/local\/cuda/g' config.mk && \
-    sed -i 's/USE_CUDNN = 0/USE_CUDNN = 1/g' config.mk && \
-    make -j"$(nproc)"
-
-ENV PYTHONPATH /root/mxnet/python
-
-WORKDIR /root/mxnet
diff --git a/docker/cuda/8.0/Dockerfile b/docker/cuda/8.0/Dockerfile
deleted file mode 100644
index c67375576f03..000000000000
--- a/docker/cuda/8.0/Dockerfile
+++ /dev/null
@@ -1,25 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-devel
-MAINTAINER Qingsong Liu
-
-RUN apt-get update && apt-get install -y \
-    git \
-    libopenblas-dev \
-    libopencv-dev \
-    python-dev \
-    python-numpy \
-    python-setuptools \
-    wget \
-    python-pip \
-    unzip
-
-RUN cd /root && git clone --recursive https://github.com/dmlc/mxnet && cd mxnet && \
-    cp make/config.mk . && \
&& \ - sed -i 's/USE_BLAS = atlas/USE_BLAS = openblas/g' config.mk && \ - sed -i 's/USE_CUDA = 0/USE_CUDA = 1/g' config.mk && \ - sed -i 's/USE_CUDA_PATH = NONE/USE_CUDA_PATH = \/usr\/local\/cuda/g' config.mk && \ - sed -i 's/USE_CUDNN = 0/USE_CUDNN = 1/g' config.mk && \ - make -j"$(nproc)" - -ENV PYTHONPATH /root/mxnet/python - -WORKDIR /root/mxnet diff --git a/docker/install/cpp.sh b/docker/install/cpp.sh new file mode 100755 index 000000000000..91b8b8db0607 --- /dev/null +++ b/docker/install/cpp.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +# libraries for building mxnet c++ core on ubuntu + +apt-get update && apt-get install -y \ + build-essential git libatlas-base-dev libopencv-dev \ + libcurl4-openssl-dev libgtest-dev cmake wget unzip + +cd /usr/src/gtest && cmake CMakeLists.txt && make && cp *.a /usr/lib diff --git a/docker/install/julia.sh b/docker/install/julia.sh new file mode 100755 index 000000000000..604a1bc2c234 --- /dev/null +++ b/docker/install/julia.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# install libraries for mxnet's julia package on ubuntu + +# the julia version shipped with ubuntu (version 0.4) is too low. so download a +# new version +# apt-get install -y julia + +wget -q https://julialang.s3.amazonaws.com/bin/linux/x64/0.5/julia-0.5.1-linux-x86_64.tar.gz +tar -zxf julia-0.5.1-linux-x86_64.tar.gz +rm julia-0.5.1-linux-x86_64.tar.gz +ln -s $(pwd)/julia-6445c82d00/bin/julia /usr/bin/julia diff --git a/docker/install/python.sh b/docker/install/python.sh new file mode 100755 index 000000000000..0459bb9198c4 --- /dev/null +++ b/docker/install/python.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +# install libraries for mxnet's python package on ubuntu + +apt-get update && apt-get install -y python-dev python3-dev + +# the version of the pip shipped with ubuntu may be too lower, install a recent version here +cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py && python2 get-pip.py + +pip2 install nose pylint numpy nose-timer requests +pip3 install nose pylint numpy nose-timer requests diff --git a/docker/install/r.sh b/docker/install/r.sh new file mode 100755 index 000000000000..9351763ddcee --- /dev/null +++ b/docker/install/r.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +# install libraries for mxnet's r package on ubuntu + +echo "deb http://cran.rstudio.com/bin/linux/ubuntu trusty/" >> /etc/apt/sources.list +gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 +gpg -a --export E084DAB9 | apt-key add - + +apt-get update +apt-get install -y r-base r-base-dev libxml2-dev libxt-dev libssl-dev + +cd "$(dirname "${BASH_SOURCE[0]}")" + +if [ ! -f "./DESCRIPTION" ]; then + cp ../../R-package/DESCRIPTION . 
+fi
+
+Rscript -e "install.packages('devtools', repo = 'https://cran.rstudio.com')"
+Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cran.rstudio.com')); install_deps(dependencies = TRUE)"
diff --git a/docker/install/scala.sh b/docker/install/scala.sh
new file mode 100755
index 000000000000..8cbe91199463
--- /dev/null
+++ b/docker/install/scala.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# install libraries for mxnet's scala package on ubuntu
+
+apt-get install -y maven default-jdk
+
+wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb
+dpkg -i scala-2.11.8.deb
+rm scala-2.11.8.deb
diff --git a/docker/run.sh b/docker/run.sh
new file mode 100644
index 000000000000..0037ab1926d7
--- /dev/null
+++ b/docker/run.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Build and push all docker containers
+
+DEVICES=('cpu' 'gpu')
+LANGUAGES=('python' 'julia' 'r-lang' 'scala')
+for DEV in "${DEVICES[@]}"; do
+    for LANG in "${LANGUAGES[@]}"; do
+        ./tool.sh build ${LANG} ${DEV}
+        ./tool.sh push ${LANG} ${DEV}
+    done
+done
diff --git a/docker/tool.sh b/docker/tool.sh
new file mode 100755
index 000000000000..31a98822350d
--- /dev/null
+++ b/docker/tool.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+#
+# Script to build, test and push a docker container
+#
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+function show_usage() {
+    echo ""
+    echo "Usage: $(basename $0) COMMAND LANGUAGE DEVICE"
+    echo ""
+    echo "  COMMAND: build or push."
+    echo "           push requires being logged in to docker hub."
+    echo "  LANGUAGE: the language binding to build, e.g. python, r-lang, julia, or scala"
+    echo "  DEVICE: target device, e.g. cpu, or gpu"
+    echo ""
+}
+
+if (( $# < 3 )); then
+    show_usage
+    exit -1
+fi
+
+COMMAND=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
+shift 1
+LANGUAGE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
+shift 1
+DEVICE=$( echo "$1" | tr '[:upper:]' '[:lower:]' )
+shift 1
+
+DOCKERFILE_LIB="${SCRIPT_DIR}/Dockerfiles/Dockerfile.in.lib.${DEVICE}"
+if [ ! -e ${DOCKERFILE_LIB} ]; then
+    echo "Error DEVICE=${DEVICE}, failed to find ${DOCKERFILE_LIB}"
+    show_usage
+    exit 1
+fi
+
+DOCKERFILE_LANG="${SCRIPT_DIR}/Dockerfiles/Dockerfile.in.${LANGUAGE}"
+if [ ! -e ${DOCKERFILE_LANG} ]; then
+    echo "Error LANGUAGE=${LANGUAGE}, failed to find ${DOCKERFILE_LANG}"
+    show_usage
+    exit 1
+fi
+
+if [[ "${DEVICE}" == *"gpu"* ]] && [[ "${COMMAND}" == "test" ]]; then
+    DOCKER_BINARY="nvidia-docker"
+else
+    DOCKER_BINARY="docker"
+fi
+
+DOCKER_TAG="mxnet/${LANGUAGE}"
+if [ "${DEVICE}" != 'cpu' ]; then
+    DOCKER_TAG="${DOCKER_TAG}:${DEVICE}"
+fi
+DOCKERFILE="Dockerfile.${LANGUAGE}.${DEVICE}"
+
+# print arguments
+echo "DOCKER_BINARY: ${DOCKER_BINARY}"
+echo "DOCKERFILE: ${DOCKERFILE}"
+echo "DOCKER_TAG: ${DOCKER_TAG}"
+
+if [[ "${COMMAND}" == "build" ]]; then
+    rm -rf ${DOCKERFILE}
+    cp ${DOCKERFILE_LIB} ${DOCKERFILE}
+    cat ${DOCKERFILE_LANG} >>${DOCKERFILE}
+    # To remove the following error caused by opencv
+    #   "libdc1394 error: Failed to initialize libdc1394"
+    CMD="sh -c 'ln -s /dev/null /dev/raw1394';"
+    # setup scala classpath
+    if [[ "${LANGUAGE}" == "scala" ]]; then
+        CMD+="CLASSPATH=\${CLASSPATH}:\`ls /mxnet/scala-package/assembly/linux-x86_64-*/target/*.jar | paste -sd \":\"\` "
+    fi
+    echo "CMD ${CMD} bash" >>${DOCKERFILE}
+    ${DOCKER_BINARY} build -t ${DOCKER_TAG} -f ${DOCKERFILE} .
+elif [[ "${COMMAND}" == "push" ]]; then + ${DOCKER_BINARY} push ${DOCKER_TAG} +else + echo "Unknow COMMAND=${COMMAND}" + show_usage + exit 1 +fi diff --git a/docs/README.md b/docs/README.md index 173b9a4b1de1..8e88358c8b05 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,50 +1,26 @@ # MXNet documentation -A built version of document is available at http://mxnet.io +MXNet's documents can be built by running `make html` in this folder. -## To build the docs with Docker +A built version of document is available at http://mxnet.io -The `Dockerfile` in this directory encapsulates all the dependencies needed -to build the docs. The default entry-point builds the docs and serves them -through a simple HTTP server for previewing. +To build the documents locally, the easiest way is by using `docker`. First make +sure [docker](docker.com) is installed. Then use the following commands to clone and +build MXNet's documents (not including jupyter notebooks and API documents +execept for Python): +```bash +git clone --recursive https://github.com/dmlc/mxnet +cd mxnet +tests/ci_build/ci_build.sh doc DEV=1 make -C docs/ html ``` -docker build -t mxnet/docs . -docker run -it -p 8008:8008 mxnet/docs -open http://localhost:8008/ -``` - -### Faster iterative development -If you are working on the docs and want to rebuild them without creating a new -docker image each time, you can do this with - -``` -docker run -it -p 8008:8008 -v `pwd`:/opt/mxnet/docs mxnet/docs -``` +The built documents will be available at `docs/_build/html/`. -which maps your current directory into the docker image to get any local -changes. - -**NOTE:** Any changes to the API reference will not get rebuilt this way. -The API reference docs are introspected from the built binaries, which -in this Dockerfile are pulled from github/dmlc/master. To work-around -this, map a volume with your code changes into the container, and rebuild -MXNet in the container before doing the doc build. Or use the local -build described below. - -## Local build - -To build the documentation without docker on your local machine, first -install the required packages for Ubuntu 14.04. These are approximately: - -``` -sudo apt-get install doxygen python-pip -sudo pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock==1.0.1 recommonmark -``` +Note: -(Refer to the Dockerfile for a more reliable description of the dependencies.) -Once the MXNet binaries are built, and you have the dependencies installed, -you can build the docs with: +- If C++ codes have been changed, we suggest to remove the previous results before + building, namely run `rm -rf docs/_build/html/`. -```make html``` +- If CSS or javascript are changed, we often need to do a *force refresh* in the + browser to clear the cache. 
diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js
new file mode 100644
index 000000000000..77ef94074c57
--- /dev/null
+++ b/docs/_static/js/options.js
@@ -0,0 +1,23 @@
+$(document).ready(function () {
+  function label(lbl) {
+    return lbl.replace(/[ .]/g, '-').toLowerCase();
+  }
+  function showContent() {
+    $('.opt-group .opt').each(function(){
+      $('.' + label($(this).text())).hide();
+      $('.highlight-' + label($(this).text())).hide();
+    });
+    $('.opt-group .active').each(function(){
+      $('.' + label($(this).text())).show();
+      $('.highlight-' + label($(this).text())).show();
+    });
+  }
+  showContent();
+  function setContent() {
+    var el = $(this);
+    el.siblings().removeClass('active');
+    el.addClass('active');
+    showContent();
+  }
+  $('.opt-group').on('click', '.opt', setContent);
+});
diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html
index a9d5c5660d3b..04e00b4daef9 100644
--- a/docs/_static/mxnet-theme/index.html
+++ b/docs/_static/mxnet-theme/index.html
@@ -34,7 +34,7 @@

[index.html markup stripped during extraction; the recoverable content of this hunk is:]
       Portable
       Multiple Languages
-      Supports multiple languages, including C++, Python, R, Scala, Julia, Matlab and Javascript - All with the same amazing performance.
+      Supports multiple languages, including C++, Python, R, Scala, Julia, Perl, Matlab and Javascript - All with the same amazing performance.
       Auto-Differentiation
[The remaining hunks (@@ -62,7 +62,7 @@, @@ -72,7 +72,7 @@, @@ -86,17 +86,17 @@, @@ -106,15 +106,15 @@) sit in the "Performance" and sponsors sections ("MXNet and sponsoring its major developers (alphabetical order).") and only swap image/link tags whose markup was stripped and cannot be reconstructed.]
diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html
index 85e7bdbbbe47..d4926b993cac 100644
--- a/docs/_static/mxnet-theme/layout.html
+++ b/docs/_static/mxnet-theme/layout.html
@@ -65,6 +65,19 @@
[13 added lines of script/markup stripped during extraction]
@@ -92,7 +105,7 @@
 {{ metatags }}
 {%- block htmltitle %}
 {%- if pagename != 'index' %}
-[stripped title markup]
+[stripped title markup] {{ title|striptags|e }}{{ titlesuffix }}
 {%- else %}
 MXNet Documents
diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html
index f5b2cb7f0de8..eff7d0810a3e 100644
--- a/docs/_static/mxnet-theme/navbar.html
+++ b/docs/_static/mxnet-theme/navbar.html
@@ -21,7 +21,7 @@